In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
import re
import scipy.stats as stats
import math

# set the graphs to show in the jupyter notebook
%matplotlib inline

# set seabor graphs to a better style
sns.set(style="ticks")

### Business Problem - 1

#### EDA

In [80]:
loan_data = pd.read_csv('D:/python/proj_7/LoansData.csv')

In [81]:
loan_data.dtypes

Amount.Requested                  float64
Amount.Funded.By.Investors        float64
Interest.Rate                      object
Loan.Length                        object
Loan.Purpose                       object
Debt.To.Income.Ratio               object
State                              object
Home.Ownership                     object
Monthly.Income                    float64
FICO.Range                         object
Open.CREDIT.Lines                 float64
Revolving.CREDIT.Balance          float64
Inquiries.in.the.Last.6.Months    float64
Employment.Length                  object
dtype: object

In [82]:
loan_data['Loan.Length'].value_counts()

36 months    1952
60 months     548
Name: Loan.Length, dtype: int64

In [83]:
loan_data.head(2)

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,20000.0,20000.0,8.90%,36 months,debt_consolidation,14.90%,SC,MORTGAGE,6541.67,735-739,14.0,14272.0,2.0,< 1 year
1,19200.0,19200.0,12.12%,36 months,debt_consolidation,28.36%,TX,MORTGAGE,4583.33,715-719,12.0,11140.0,1.0,2 years


In [84]:
loan_data['Loan.Purpose'].value_counts()

debt_consolidation    1307
credit_card            444
other                  201
home_improvement       152
major_purchase         101
small_business          87
car                     50
wedding                 39
medical                 30
moving                  29
vacation                21
house                   20
educational             15
renewable_energy         4
Name: Loan.Purpose, dtype: int64

In [85]:
loan_data.shape

(2500, 14)

In [86]:
loan_data['Interest.Rate'] = loan_data['Interest.Rate'].str.replace('%','').astype('float')

In [87]:
loan_data['Interest.Rate'] = loan_data['Interest.Rate'].apply(lambda i:pd.Series(i/100))

In [88]:
loan_data['Interest.Rate'].mean()

0.130653883106485

In [89]:
loan_data['Interest.Rate'].isna().sum()

2

In [90]:
loan_data.isna().sum()

Amount.Requested                   1
Amount.Funded.By.Investors         1
Interest.Rate                      2
Loan.Length                        0
Loan.Purpose                       0
Debt.To.Income.Ratio               1
State                              0
Home.Ownership                     1
Monthly.Income                     1
FICO.Range                         2
Open.CREDIT.Lines                  3
Revolving.CREDIT.Balance           3
Inquiries.in.the.Last.6.Months     3
Employment.Length                 77
dtype: int64

In [91]:
loan_data['Debt.To.Income.Ratio'] = loan_data['Debt.To.Income.Ratio'].str.replace('%','').astype('float')

In [92]:
loan_data[['fico_low' , 'fico_high']] = loan_data['FICO.Range'].apply(lambda  i: pd.Series(str(i).split('-')))

In [93]:
loan_data.head(3)

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length,fico_low,fico_high
0,20000.0,20000.0,0.089,36 months,debt_consolidation,14.9,SC,MORTGAGE,6541.67,735-739,14.0,14272.0,2.0,< 1 year,735,739
1,19200.0,19200.0,0.1212,36 months,debt_consolidation,28.36,TX,MORTGAGE,4583.33,715-719,12.0,11140.0,1.0,2 years,715,719
2,35000.0,35000.0,0.2198,60 months,debt_consolidation,23.81,CA,MORTGAGE,11500.0,690-694,14.0,21977.0,1.0,2 years,690,694


In [94]:
loan_data['fico_low'] = loan_data['fico_low'].astype('float')

In [95]:
loan_data['fico_high'] = loan_data['fico_high'].astype('float')

In [96]:
loan_data['fico_avg'] = (loan_data['fico_low'] + loan_data['fico_high']) / 2

In [97]:
loan_data.head()

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length,fico_low,fico_high,fico_avg
0,20000.0,20000.0,0.089,36 months,debt_consolidation,14.9,SC,MORTGAGE,6541.67,735-739,14.0,14272.0,2.0,< 1 year,735.0,739.0,737.0
1,19200.0,19200.0,0.1212,36 months,debt_consolidation,28.36,TX,MORTGAGE,4583.33,715-719,12.0,11140.0,1.0,2 years,715.0,719.0,717.0
2,35000.0,35000.0,0.2198,60 months,debt_consolidation,23.81,CA,MORTGAGE,11500.0,690-694,14.0,21977.0,1.0,2 years,690.0,694.0,692.0
3,10000.0,9975.0,0.0999,36 months,debt_consolidation,14.3,KS,MORTGAGE,3833.33,695-699,10.0,9346.0,0.0,5 years,695.0,699.0,697.0
4,12000.0,12000.0,0.1171,36 months,credit_card,18.78,NJ,RENT,3195.0,695-699,11.0,14469.0,0.0,9 years,695.0,699.0,697.0


In [98]:
loan_data['Inquiries.in.the.Last.6.Months'] = loan_data['Inquiries.in.the.Last.6.Months'].astype('str')

### a) Interest rate is varied for different loan amounts?

#### Dependent(paired) T test

In [99]:
stats.ttest_rel(a=loan_data['Amount.Funded.By.Investors'] , b=loan_data['Interest.Rate'])

Ttest_relResult(statistic=nan, pvalue=nan)

In [100]:
loan_data.corr(method='pearson').iloc[2,1]

0.3374540043675973

In [102]:
loan_data.corr(method='pearson')

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Debt.To.Income.Ratio,Monthly.Income,Open.CREDIT.Lines,Revolving.CREDIT.Balance,fico_low,fico_high,fico_avg
Amount.Requested,1.0,0.969843,0.332454,0.081923,0.389527,0.195694,0.295064,0.083221,0.083221,0.083221
Amount.Funded.By.Investors,0.969843,1.0,0.337454,0.094383,0.372339,0.185346,0.263499,0.074036,0.074036,0.074036
Interest.Rate,0.332454,0.337454,1.0,0.173442,0.011968,0.090546,0.061746,-0.709035,-0.709035,-0.709035
Debt.To.Income.Ratio,0.081923,0.094383,0.173442,1.0,-0.162906,0.370764,0.188644,-0.218322,-0.218322,-0.218322
Monthly.Income,0.389527,0.372339,0.011968,-0.162906,1.0,0.171558,0.359896,0.12198,0.12198,0.12198
Open.CREDIT.Lines,0.195694,0.185346,0.090546,0.370764,0.171558,1.0,0.289559,-0.088968,-0.088968,-0.088968
Revolving.CREDIT.Balance,0.295064,0.263499,0.061746,0.188644,0.359896,0.289559,1.0,0.002484,0.002484,0.002484
fico_low,0.083221,0.074036,-0.709035,-0.218322,0.12198,-0.088968,0.002484,1.0,1.0,1.0
fico_high,0.083221,0.074036,-0.709035,-0.218322,0.12198,-0.088968,0.002484,1.0,1.0,1.0
fico_avg,0.083221,0.074036,-0.709035,-0.218322,0.12198,-0.088968,0.002484,1.0,1.0,1.0


- The above pearson correlation method is showing that there is not much relation between Interest rate and loan amount.

### b) Loan length is directly effecting interest rate?

#### Independent(2 Sample) T test

In [145]:
ll36 = loan_data.loc[loan_data['Loan.Length'] == '36 months' , 'Interest.Rate'].dropna()

In [146]:
ll60 = loan_data.loc[loan_data['Loan.Length'] == '60 months' , 'Interest.Rate'].dropna()

In [147]:
print('The mean of loan lenght with 36 months' , round(ll36.mean(),2))
print('The mean of loan lenght with 60 months' , round(ll60.mean(),2))

The mean of loan lenght with 36 months 0.12
The mean of loan lenght with 60 months 0.16


In [148]:
print('The Std deviation of loan lenght with 36 months' , round(ll36.std(),2))
print('The Std deviation of loan lenght with 60 months' , round(ll60.std(),2))

The Std deviation of loan lenght with 36 months 0.04
The Std deviation of loan lenght with 60 months 0.04


In [151]:
stats.ttest_ind(a=ll36 , b=ll60 , equal_var=True)

Ttest_indResult(statistic=-23.404324256626747, pvalue=1.0952664487978044e-109)

In [108]:
loan_data.corr(method='pearson')

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Debt.To.Income.Ratio,Monthly.Income,Open.CREDIT.Lines,Revolving.CREDIT.Balance,fico_low,fico_high,fico_avg
Amount.Requested,1.0,0.969843,0.332454,0.081923,0.389527,0.195694,0.295064,0.083221,0.083221,0.083221
Amount.Funded.By.Investors,0.969843,1.0,0.337454,0.094383,0.372339,0.185346,0.263499,0.074036,0.074036,0.074036
Interest.Rate,0.332454,0.337454,1.0,0.173442,0.011968,0.090546,0.061746,-0.709035,-0.709035,-0.709035
Debt.To.Income.Ratio,0.081923,0.094383,0.173442,1.0,-0.162906,0.370764,0.188644,-0.218322,-0.218322,-0.218322
Monthly.Income,0.389527,0.372339,0.011968,-0.162906,1.0,0.171558,0.359896,0.12198,0.12198,0.12198
Open.CREDIT.Lines,0.195694,0.185346,0.090546,0.370764,0.171558,1.0,0.289559,-0.088968,-0.088968,-0.088968
Revolving.CREDIT.Balance,0.295064,0.263499,0.061746,0.188644,0.359896,0.289559,1.0,0.002484,0.002484,0.002484
fico_low,0.083221,0.074036,-0.709035,-0.218322,0.12198,-0.088968,0.002484,1.0,1.0,1.0
fico_high,0.083221,0.074036,-0.709035,-0.218322,0.12198,-0.088968,0.002484,1.0,1.0,1.0
fico_avg,0.083221,0.074036,-0.709035,-0.218322,0.12198,-0.088968,0.002484,1.0,1.0,1.0


### c)

In [109]:
loan_data.head()

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length,fico_low,fico_high,fico_avg
0,20000.0,20000.0,0.089,36 months,debt_consolidation,14.9,SC,MORTGAGE,6541.67,735-739,14.0,14272.0,2.0,< 1 year,735.0,739.0,737.0
1,19200.0,19200.0,0.1212,36 months,debt_consolidation,28.36,TX,MORTGAGE,4583.33,715-719,12.0,11140.0,1.0,2 years,715.0,719.0,717.0
2,35000.0,35000.0,0.2198,60 months,debt_consolidation,23.81,CA,MORTGAGE,11500.0,690-694,14.0,21977.0,1.0,2 years,690.0,694.0,692.0
3,10000.0,9975.0,0.0999,36 months,debt_consolidation,14.3,KS,MORTGAGE,3833.33,695-699,10.0,9346.0,0.0,5 years,695.0,699.0,697.0
4,12000.0,12000.0,0.1171,36 months,credit_card,18.78,NJ,RENT,3195.0,695-699,11.0,14469.0,0.0,9 years,695.0,699.0,697.0


In [122]:
loan_data['Loan.Purpose'].value_counts()

debt_consolidation    1307
credit_card            444
other                  201
home_improvement       152
major_purchase         101
small_business          87
car                     50
wedding                 39
medical                 30
moving                  29
vacation                21
house                   20
educational             15
renewable_energy         4
Name: Loan.Purpose, dtype: int64

In [133]:
dcir = loan_data.loc[loan_data['Loan.Purpose'] == 'debt_consolidation' , 'Interest.Rate'].dropna()
ccir = loan_data.loc[loan_data['Loan.Purpose'] == 'credit_card' , 'Interest.Rate'].dropna()
oir = loan_data.loc[loan_data['Loan.Purpose'] == 'other' , 'Interest.Rate'].dropna()
hiir = loan_data.loc[loan_data['Loan.Purpose'] == 'home_improvement' , 'Interest.Rate'].dropna()
mpir = loan_data.loc[loan_data['Loan.Purpose'] == 'major_purchase' , 'Interest.Rate'].dropna()
sbir = loan_data.loc[loan_data['Loan.Purpose'] == 'small_business' , 'Interest.Rate'].dropna()
cir = loan_data.loc[loan_data['Loan.Purpose'] == 'car' , 'Interest.Rate'].dropna()
wir = loan_data.loc[loan_data['Loan.Purpose'] == 'wedding' , 'Interest.Rate'].dropna()
mir = loan_data.loc[loan_data['Loan.Purpose'] == 'medical' , 'Interest.Rate'].dropna()
moir = loan_data.loc[loan_data['Loan.Purpose'] == 'moving' , 'Interest.Rate'].dropna()
vir = loan_data.loc[loan_data['Loan.Purpose'] == 'vacation' , 'Interest.Rate'].dropna()
hir = loan_data.loc[loan_data['Loan.Purpose'] == 'house' , 'Interest.Rate'].dropna()
eir = loan_data.loc[loan_data['Loan.Purpose'] == 'educational' , 'Interest.Rate'].dropna()
reir = loan_data.loc[loan_data['Loan.Purpose'] == 'renewable_energy' , 'Interest.Rate'].dropna()

In [134]:
print('Mean for debt_consolidation',round(dcir.mean(),2))
print('Mean for credit_card',round(ccir.mean(),2))
print('Mean for other',round(oir.mean(),2))
print('Mean for home_improvement',round(hiir.mean(),2))
print('Mean for major_purchase',round(mpir.mean(),2))
print('Mean for small_business',round(sbir.mean(),2))
print('Mean for car',round(cir.mean(),2))
print('Mean for wedding',round(wir.mean(),2))
print('Mean for medical',round(mir.mean(),2))
print('Mean for moving',round(moir.mean(),2))
print('Mean for vacation',round(vir.mean(),2))
print('Mean for house',round(hir.mean(),2))
print('Mean for educational',round(eir.mean(),2))
print('Mean for renewable_energy',round(reir.mean(),2))

Mean for debt_consolidation 0.14
Mean for credit_card 0.13
Mean for other 0.13
Mean for home_improvement 0.12
Mean for major_purchase 0.11
Mean for small_business 0.13
Mean for car 0.11
Mean for wedding 0.12
Mean for medical 0.12
Mean for moving 0.14
Mean for vacation 0.12
Mean for house 0.13
Mean for educational 0.11
Mean for renewable_energy 0.1


In [135]:
stats.f_oneway(dcir , ccir , oir , hiir , mpir , sbir , cir , wir , mir , moir , vir , hir , eir , reir)

F_onewayResult(statistic=7.481404351302257, pvalue=1.1686298123677374e-14)

- f val: 7.48

- Significance Level: 0.05 (5%)
- P value: 1.1686298123677374e-14 (very low)
- p val < signi value
- Accepting the Ha
- HO: 0.14 != 0.13 != 0.13 != 0.12 != 0.11 != 0.13 != 0.11 != 0.12 != 0.12 != 0.14 != 0.12 != 0.13 != 0.11 != 0.1
<br>

- **Analysis** : There is a statistically significant difference between purpose of loans.

### d)

In [153]:
loan_data['Home.Ownership'].value_counts()

MORTGAGE    1147
RENT        1146
OWN          200
OTHER          5
NONE           1
Name: Home.Ownership, dtype: int64

In [158]:
fm = loan_data.loc[loan_data['Home.Ownership'] == 'MORTGAGE' , 'fico_avg'].dropna()
fr = loan_data.loc[loan_data['Home.Ownership'] == 'RENT' , 'fico_avg'].dropna()
fow = loan_data.loc[loan_data['Home.Ownership'] == 'OWN' , 'fico_avg'].dropna()
fot = loan_data.loc[loan_data['Home.Ownership'] == 'OTHER' , 'fico_avg'].dropna()
fn = loan_data.loc[loan_data['Home.Ownership'] == 'NONE' , 'fico_avg'].dropna()

In [159]:
print('Mean for Mortgage',round(fm.mean(),2))
print('Mean for Rent',round(fr.mean(),2))
print('Mean for Own',round(fow.mean(),2))
print('Mean for Other',round(fot.mean(),2))
print('Mean for None',round(fn.mean(),2))

Mean for Mortgage 713.7
Mean for Rent 702.06
Mean for Own 708.55
Mean for Other 674.0
Mean for None 802.0


In [160]:
stats.f_oneway(fm , fr , fow , fot , fn)

F_onewayResult(statistic=19.334591709631045, pvalue=1.114378111228758e-15)

- f val: 19.33

- Significance Level: 0.05 (5%)
- P value: 1.114378111228758e-15 (very low)
- p val < signi value
- Accepting the Ha
- HO: 713.7 != 702.06 != 708.55 != 674.0 != 802.0
<br>

- **Analysis** : There is a statistically significant difference in fico scores for home ownership.

### Business Problem - 2

#### EDA

In [63]:
price_quotes = pd.read_csv('D:/python/proj_7/Price_Quotes.csv')

In [64]:
price_quotes.head(12)

Unnamed: 0,Order_Number,Barry_Price,Mary_Price
0,1,126,114
1,2,110,118
2,3,138,114
3,4,142,111
4,5,146,129
5,6,136,119
6,7,94,97
7,8,103,104
8,9,140,127
9,10,152,133


In [65]:
price_quotes.dtypes

Order_Number    int64
Barry_Price     int64
Mary_Price      int64
dtype: object

In [66]:
mary_avg_price = price_quotes['Mary_Price'].mean()

In [67]:
barry_avg_price = price_quotes['Barry_Price'].mean()

In [68]:
print(mary_avg_price)
print(barry_avg_price)

114.75
124.33333333333333


#### Dependent(paired) T test

In [69]:
stats.ttest_rel(a=price_quotes.Barry_Price , b=price_quotes.Mary_Price)

Ttest_relResult(statistic=2.521376510892349, pvalue=0.02840588045242053)

- t val: 2.52

- Significance Level: 0.05 (5%)
- P value: 0.028 (HIGH)
- p val < signi value
- Accepting the Ha
- HO: 114.75 != 124.33
<br>

- **Analysis** : There has been a statistically significant difference in the average price quotes provided by mary and barry.

### Business Problem - 3

#### EDA

In [6]:
treat_fac = pd.read_csv('D:/python/proj_7/Treatment_Facility.csv')

In [7]:
treat_fac.rename(columns={'VAR4' : 'TRFF' , 'VAR5' : 'CI'} , inplace=True)

In [14]:
treat_fac.head()

Unnamed: 0,Month,Reengineer,Employee_Turnover,TRFF,CI
0,1,Prior,0.0,24.390244,42.682927
1,2,Prior,6.0606,19.354839,25.806452
2,3,Prior,12.1212,35.087719,146.19883
3,4,Prior,3.3333,18.404908,110.429448
4,5,Prior,12.9032,17.964072,23.952096


In [45]:
treat_fac.dtypes

Month                  int64
Reengineer            object
Employee_Turnover    float64
TRFF                 float64
CI                   float64
dtype: object

In [33]:
prior_TO = treat_fac.loc[treat_fac.Reengineer == 'Prior' , 'Employee_Turnover']

In [34]:
post_TO = treat_fac.loc[treat_fac.Reengineer == 'Post' , 'Employee_Turnover']

In [40]:
print('The mean of prior_TO is' , round(prior_TO.mean(),2))
print('The mean of post_TO is' , round(post_TO.mean(),2))

The mean of prior_TO is 11.74
The mean of post_TO is 18.69


In [41]:
print('The std deviation of prior_TO is' , round(prior_TO.std(),2))
print('The std deviation of post_TO is' , round(post_TO.std(),2))

The std deviation of prior_TO is 7.04
The std deviation of post_TO is 10.56


#### Re-engineering vs employee turnover

In [42]:
stats.ttest_ind(a=prior_TO , b=post_TO , equal_var=False)

Ttest_indResult(statistic=-1.5653912078421088, pvalue=0.15207128913702453)

- t val: -1.565

- Significance Level: 0.05 (5%)
- P value: 0.152 (HIGH)
- p val > signi value
- Accepting the H0
- H0: 11.74 == 18.69
<br>

- **Analysis** : There is no statistically significant difference between the Re-engineering vs employee turnover.

#### Re-engineering vs critical incidence

In [47]:
prior_ci = treat_fac.loc[treat_fac.Reengineer == 'Prior' , 'CI']

In [48]:
post_ci = treat_fac.loc[treat_fac.Reengineer == 'Post' , 'CI']

In [49]:
print('The mean of prior_ci is' , round(prior_ci.mean(),2))
print('The mean of post_ci is' , round(post_ci.mean(),2))

The mean of prior_ci is 53.89
The mean of post_ci is 23.35


In [50]:
print('The std deviation of prior_ci is' , round(prior_ci.std(),2))
print('The std deviation of post_ci is' , round(post_ci.std(),2))

The std deviation of prior_ci is 48.7
The std deviation of post_ci is 7.81


In [51]:
stats.ttest_ind(a=prior_ci , b=post_ci , equal_var=False)

Ttest_indResult(statistic=2.209006531430452, pvalue=0.045565519570193176)

- t val: 2.209

- Significance Level: 0.05 (5%)
- P value: 0.045 (Low)
- p val < signi value
- Accepting the Ha
- H0: 53.89!=23.35
<br>

- **Analysis** : There has been a statistically significant difference between Re-engineering vs critical incidence

### Business Problem - 4

In [52]:
prior_assess = pd.read_csv('D:/python/proj_7/Priority_Assessment.csv')

In [57]:
prior_assess.head()

Unnamed: 0,Days,Priority
0,3.3,High
1,7.9,Medium
2,0.3,High
3,0.7,Medium
4,8.6,Medium


In [62]:
prior_high = prior_assess.loc[prior_assess.Priority == 'High' , 'Days']

In [63]:
prior_medium = prior_assess.loc[prior_assess.Priority == 'Medium' , 'Days']

In [64]:
prior_low = prior_assess.loc[prior_assess.Priority == 'Low' , 'Days']

In [66]:
print('The mean of prior_high is' , round(prior_high.mean(),2))
print('The mean of prior_medium is' , round(prior_medium.mean(),2))
print('The mean of prior_low is' , round(prior_low.mean(),2))

The mean of prior_high is 3.02
The mean of prior_medium is 2.5
The mean of prior_low is 4.23


In [67]:
stats.f_oneway(prior_high , prior_medium , prior_low)

F_onewayResult(statistic=1.812311010076072, pvalue=0.16411459461716182)

- f val: 1.812

- Significance Level: 0.05 (5%)
- P value: 0.164 (High)
- p val > signi value
- Accepting the HO
- HO: 3.02 = 2.5 = 4.23
<br>

- **Analysis** : There is no statistically significant difference in number of days for high, medium and low priority jobs. The system is working fine.

### Business Problem - 5

In [13]:
films = pd.read_csv('D:/python/proj_7/Films.csv')

In [14]:
films.head()

Unnamed: 0,_rowstate_,Movie,Gender,Marital_Status,Sinage,Parking,Clean,Overall,Age,Income,Hear_About
0,0,Ferris Buellers Day Off,Female,Married,2.0,2.0,2.0,2.0,3.0,1.0,5
1,0,Ferris Buellers Day Off,Female,Single,1.0,1.0,1.0,1.0,2.0,1.0,5
2,0,Ferris Buellers Day Off,Male,Married,2.0,4.0,3.0,2.0,4.0,1.0,5
3,0,Ferris Buellers Day Off,Female,Married,1.0,3.0,2.0,2.0,4.0,1.0,5
4,0,Ferris Buellers Day Off,Female,Married,1.0,1.0,1.0,1.0,3.0,3.0,1


### a)

In [15]:
films.Age.value_counts()

2.0    175
3.0    117
1.0     26
4.0     10
Name: Age, dtype: int64

In [16]:
films.dtypes

_rowstate_          int64
Movie              object
Gender             object
Marital_Status     object
Sinage            float64
Parking           float64
Clean             float64
Overall           float64
Age               float64
Income            float64
Hear_About         object
dtype: object

In [77]:
round(films['Overall'].mean(), 0)

2.0

- The overall satisfaction by customer is good.
- We will not do any hypothesis tetsing in the above part , simply we'll calculate the mean to find
  the overall satisfaction of the customer.

### b)

In [36]:
fico = films.corr()

In [74]:
films.corr(method='pearson')

Unnamed: 0,_rowstate_,Sinage,Parking,Clean,Overall,Age,Income
_rowstate_,,,,,,,
Sinage,,1.0,0.470322,0.349163,0.382881,-0.091512,-0.036721
Parking,,0.470322,1.0,0.444368,0.516585,-0.023607,-0.022084
Clean,,0.349163,0.444368,1.0,0.349412,-0.005052,0.019851
Overall,,0.382881,0.516585,0.349412,1.0,-0.034171,-0.00405
Age,,-0.091512,-0.023607,-0.005052,-0.034171,1.0,0.144772
Income,,-0.036721,-0.022084,0.019851,-0.00405,0.144772,1.0


- The above pearson correlation method is showing us that the variables related to satisfation are :- Parking , Sinage , Clean

### b)

In [48]:
films.Movie.value_counts()

Willy Wonka                161
Ferris Buellers Day Off    137
Old School                  32
Name: Movie, dtype: int64

In [75]:
films.loc[: , ['Movie' , 'Sinage' , 'Parking']].value_counts()

Movie                    Sinage  Parking
Willy Wonka              2.0     2.0        57
Ferris Buellers Day Off  2.0     2.0        45
Willy Wonka              1.0     1.0        41
Ferris Buellers Day Off  1.0     1.0        36
Willy Wonka              2.0     1.0        16
Ferris Buellers Day Off  1.0     2.0        14
                         2.0     1.0        12
Old School               1.0     1.0        10
Willy Wonka              3.0     2.0         9
                                 3.0         8
                         1.0     2.0         7
Ferris Buellers Day Off  3.0     2.0         6
Old School               2.0     2.0         6
Ferris Buellers Day Off  3.0     1.0         6
                         2.0     3.0         6
Old School               2.0     1.0         6
Willy Wonka              2.0     3.0         5
                         3.0     1.0         4
Ferris Buellers Day Off  3.0     3.0         3
                                 4.0         3
                   