In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

import statsmodels.api as sm 
from statsmodels.graphics.api import abline_plot
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.model_selection import train_test_split

from scipy import stats

In [2]:
khan_data = pd.read_csv('./data/return_user.csv')

In [3]:
khan_data = khan_data.drop(['Unnamed: 0'], axis =1)

In [4]:
khan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31481 entries, 0 to 31480
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   timestamp             31481 non-null  object
 1   user_id               31481 non-null  int64 
 2   session_id            31481 non-null  int64 
 3   country               31481 non-null  object
 4   language              31481 non-null  object
 5   user_registered_flag  31481 non-null  bool  
 6   device_type           31481 non-null  object
 7   KA_app_flag           31481 non-null  bool  
 8   OS                    31481 non-null  object
 9   URI                   26149 non-null  object
 10  conversion            31481 non-null  object
 11  returned_user         31481 non-null  int64 
 12  returner              31481 non-null  object
dtypes: bool(2), int64(3), object(8)
memory usage: 2.7+ MB


In [34]:
khan_data['returner'].describe()

count     31481
unique        2
top         Yes
freq      21790
Name: returner, dtype: object

In [27]:
khan_data.head(10)

Unnamed: 0,timestamp,user_id,session_id,country,language,user_registered_flag,device_type,KA_app_flag,OS,URI,conversion,returned_user,returner
0,2016-02-18 18:05:34.408245 UTC,461023995001001,7269247775762971847,US,en,True,desktop,False,Windows,,login,1,Yes
1,2016-02-18 18:05:35.156166 UTC,461023995001001,7269247775762971847,US,en,True,desktop,False,Windows,,homepage_view,1,Yes
2,2016-02-18 18:05:44.033396 UTC,461023995001001,7269247775762971847,US,en,True,desktop,False,Windows,"""/welcome""",pageview,1,Yes
3,2016-02-18 18:06:39.681943 UTC,461023995001001,7269247775762971847,US,en,True,desktop,False,Windows,"""/math/cc-third-grade-math""",pageview,1,Yes
4,2016-02-18 18:06:55.040427 UTC,461023995001001,7269247775762971847,US,en,True,desktop,False,Windows,"""/mission/cc-third-grade-math""",pageview,1,Yes
5,2016-02-18 18:07:06.233494 UTC,461023995001001,7269247775762971847,US,en,True,desktop,False,Windows,"""/mission/cc-third-grade-math/task/63250174127...",pageview,1,Yes
6,2016-02-18 18:08:23.21072 UTC,461023995001001,7269247775762971847,US,en,True,desktop,False,Windows,"""/mission/cc-third-grade-math""",pageview,1,Yes
7,2016-02-18 18:08:35.05428 UTC,461023995001001,7269247775762971847,US,en,True,desktop,False,Windows,"""/math/cc-fifth-grade-math""",pageview,1,Yes
8,2016-02-18 18:08:41.48111 UTC,461023995001001,7269247775762971847,US,en,True,desktop,False,Windows,"""/mission/cc-fifth-grade-math""",pageview,1,Yes
9,2016-02-18 18:08:46.90117 UTC,461023995001001,7269247775762971847,US,en,True,desktop,False,Windows,"""/mission/cc-fifth-grade-math/task/60413444865...",pageview,1,Yes


## Chi-square test 

Using a Chi-square test to decide whether there is a relationiship between two categorical variables
H₀: The two categorical variables have no relationship
H₁: There is a relationship between two categorical variables



In [41]:
# Generate Dummy varibles for categorical variables -> shouldn't be needed as that is why I make a crosstab
from scipy.stats import chi2_contingency 
import numpy as np

In [42]:
#Create contigency table 
data_crosstab = pd.crosstab(khan_data['returner'], khan_data['language'], margins=True)

In [43]:
#Set Significance level
alpha = 0.05

In [38]:
#Calculation of Chisquare for language
chi_square = 0 
rows = khan_data['returner'].unique()
columns = khan_data['language'].unique()
for i in columns:
    for j in rows:
        O = data_crosstab[i][j]
        E = data_crosstab[i]['Total'] * data_crosstab['Total'][j] / data_crosstab['Total']['Total']
        chi_square += (O-E)**2/E

In [39]:
#Printing the results with the p-value approach 
print("Appraoch 1: The p-value approach to hypothesis testing in the decision rule")
p_value = 1 - stats.chi2.cdf(chi_square, (len(rows)-1)*(len(columns)-1))
conclusion = "Failed to reject the null hypothesis."
if p_value <= alpha:
    conclusion = "Null Hypothesis is rejected."
        
print("chisquare-score is:", chi_square, " and p value is:", p_value)
print(conclusion)

print("\n--------------------------------------------------------------------------------------")

print("Approach 2: The critical value approach to hypothesis testing in the decision rule")
critical_value = stats.chi2.ppf(1-alpha, (len(rows)-1)*(len(columns)-1))
conclusion = "Failed to reject the null hypothesis."
if chi_square > critical_value:
    conclusion = "Null Hypothesis is rejected."
        
print("chisquare-score is:", chi_square, " and critical value is:", critical_value)
print(conclusion)

Appraoch 1: The p-value approach to hypothesis testing in the decision rule
chisquare-score is: 303.42106797957115  and p value is: 0.0
Null Hypothesis is rejected.

--------------------------------------------------------------------------------------
Approach 2: The critical value approach to hypothesis testing in the decision rule
chisquare-score is: 303.42106797957115  and critical value is: 18.307038053275146
Null Hypothesis is rejected.


## Simpiler chi2 Test

### Being a returner correlated with language

In [48]:
chisqt = pd.crosstab(khan_data.returner, khan_data.language, margins=True)
print(chisqt)

language  da  de     en   es  fr   nb  pl   pt  th  tr  zh-HANS    All
returner                                                              
No        19   1   9061  411  11    8  14  136   4  24        2   9691
Yes        0   0  20636  493   0  153  24  484   0   0        0  21790
All       19   1  29697  904  11  161  38  620   4  24        2  31481


In [49]:
from scipy.stats import chi2_contingency 
import numpy as np
value = np.array([chisqt.iloc[0][0:5].values,
                  chisqt.iloc[1][0:5].values])
print(chi2_contingency(value)[0:3])

(160.66359102254185, 1.0534279768462328e-33, 4)


The p-value is 1.05, 160.66 is the statistical value with 4 degrees of freedom, we are unable to reject the null hypothesis, thus, language and being a returner is not correlated

### Being a returner correlated with country - Significant

In [54]:
chisqt1 = pd.crosstab(khan_data.returner, khan_data.country, margins=True)
print(chisqt1)

country   AE  AM  AR  AT   AU  BD  BE   BR  BS  BY  ...  TR  TT  TW  UA  \
returner                                            ...                   
No        39   1   2   4  257   2  19  156   5  12  ...  20   9  14  11   
Yes       21   0   0   2  350  38   0  461   0   0  ...  15  13   0  16   
All       60   1   2   6  607  40  19  617   5  12  ...  35  22  14  27   

country      US  VE  VN   ZA  ZM    All  
returner                                 
No         7212   7   9   45  16   9691  
Yes       14761   0   0  247   0  21790  
All       21973   7   9  292  16  31481  

[3 rows x 78 columns]


In [51]:
value = np.array([chisqt1.iloc[0][0:5].values,
                  chisqt1.iloc[1][0:5].values])
print(chi2_contingency(value)[0:3])

(16.241057773576184, 0.0027121839245656267, 4)


The p-value is 0.003, 16.24 is the statistical value with 4 degrees of freedom, we are able to reject the null hypothesis, thus, country of orgin and being a returner is correlated

### Being a returner correlated with Operating System (OS)

In [55]:
chisqt2 = pd.crosstab(khan_data.returner, khan_data.OS, margins=True)
print(chisqt2)

OS        Android  BlackBerry OS  Chrome OS  Linux  Mac OS X  Other  Ubuntu  \
returner                                                                      
No            449              3       1360     98      1795      7      27   
Yes          1055              0       2379     82      3710      0     412   
All          1504              3       3739    180      5505      7     439   

OS        Windows  Windows Phone   iOS    All  
returner                                       
No           5008              0   944   9691  
Yes         12116             14  2022  21790  
All         17124             14  2966  31481  


In [56]:
value = np.array([chisqt2.iloc[0][0:5].values,
                  chisqt2.iloc[1][0:5].values])
print(chi2_contingency(value)[0:3])

(65.06510802392796, 2.493204735921778e-13, 4)


The p-value is 2.49, 95.07 is the statistical value with 4 degrees of freedom, we are unable to reject the null hypothesis, thus, Operating System(OS) and being a returner is not correlated

### Being a returner correlated with using Khan Academy App

In [57]:
chisqt3 = pd.crosstab(khan_data.returner, khan_data.KA_app_flag, margins=True)
print(chisqt3)

KA_app_flag  False  True    All
returner                       
No            9498   193   9691
Yes          21387   403  21790
All          30885   596  31481


In [58]:
value = np.array([chisqt3.iloc[0][0:5].values,
                  chisqt3.iloc[1][0:5].values])
print(chi2_contingency(value)[0:3])

(0.7288911104843107, 0.6945816512928396, 2)


The p-value is 0.69, 0.73 is the statistical value with 2 degrees of freedom, we are unable to reject the null hypothesis, thus, using the Khan Academy app and being a returner is not correlated

### Being a returner correlated with device used

In [59]:
chisqt4 = pd.crosstab(khan_data.returner, khan_data.device_type, margins=True)
print(chisqt4)

device_type  desktop  phone  tablet  unknown/other    All
returner                                                 
No              8263    724     697              7   9691
Yes            18622   1148    2020              0  21790
All            26885   1872    2717              7  31481


In [60]:
value = np.array([chisqt4.iloc[0][0:5].values,
                  chisqt4.iloc[1][0:5].values])
print(chi2_contingency(value)[0:3])

(104.04690298413593, 1.3519794225204459e-21, 4)


The p-value is 1.35, 104.05 is the statistical value with 4 degrees of freedom, we are unable to reject the null hypothesis, thus, device type and being a returner is not correlated

### Being a returner correlated with being a registered user on Khan Academy - Significant

In [61]:
chisqt5 = pd.crosstab(khan_data.returner, khan_data.user_registered_flag, margins=True)
print(chisqt5)

user_registered_flag  False   True    All
returner                                 
No                     3262   6429   9691
Yes                    2079  19711  21790
All                    5341  26140  31481


In [62]:
value = np.array([chisqt5.iloc[0][0:5].values,
                  chisqt5.iloc[1][0:5].values])
print(chi2_contingency(value)[0:3])

(2769.909687905716, 0.0, 2)


The p-value is 0.0, 2769.91 is the statistical value with 2 degrees of freedom, we are unable to reject the null hypothesis, thus, device type and being a returner is not correlated