In [1]:
import pandas as pd
from collections import OrderedDict
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support,auc
from matplotlib.colors import ListedColormap
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from random import seed,randrange


In [2]:
df= pd.read_csv('wvs.csv.bz2', delimiter = '\t')
print(df.shape)
print("We have 328 variables and 90350 responses")
df.head(10)


(90350, 328)
We have 328 variables and 90350 responses


Unnamed: 0,V2,V4,V5,V6,V7,V8,V9,V10,V11,V12,...,MN_228S8,MN_229A,MN_230A,MN_233A,MN_237B1,MN_249A1,MN_249A3,I_RELIGBEL,I_NORM1,I_VOICE1
0,12,1,1,1,-2,1,1,2,1,1,...,3,-3,-3,-3,-3,1,1,0.0,1.0,0.0
1,12,1,2,3,4,2,2,2,2,2,...,3,-3,-3,-3,-3,2,-1,0.0,1.0,0.66
2,12,1,3,2,4,2,1,2,2,2,...,4,1,1,2,-3,1,1,0.0,1.0,0.33
3,12,1,1,3,4,3,1,2,1,2,...,2,2,1,2,-3,1,2,0.0,1.0,0.0
4,12,1,1,1,2,1,1,1,3,2,...,2,2,1,2,-3,1,2,0.0,1.0,0.66
5,12,1,2,2,2,4,1,2,1,2,...,3,2,1,1,-3,1,2,0.0,1.0,0.0
6,12,1,1,1,1,1,1,2,2,1,...,3,2,2,2,-3,1,1,0.0,1.0,0.66
7,12,1,1,1,1,2,2,2,1,2,...,3,1,1,2,-3,2,2,0.0,1.0,0.0
8,12,1,1,1,2,2,2,2,2,2,...,3,2,1,1,-3,-3,-3,0.0,1.0,0.33
9,12,1,1,1,2,1,1,1,1,2,...,3,-3,-3,-3,0,-3,-3,0.0,1.0,0.66


In [3]:
df_v204 = df[['V204']].copy()
df_v204.tail(5)
df_v204.describe()


Unnamed: 0,V204
count,90350.0
mean,2.946386
std,2.96404
min,-5.0
25%,1.0
50%,2.0
75%,5.0
max,10.0


In [4]:
def count(df,column):
    print (sum(df[column].pct_change().fillna(0) > 0))
    
count(df_v204, 'V204')
len(df_v204)
print("Thus we see V204 has 24896 postive responses - non missing values out of the 90350 responses")

24896
Thus we see V204 has 24896 postive responses - non missing values out of the 90350 responses


In [5]:
print("Reponse 1",df_v204.loc[df_v204.V204 == 1].count())
print("Reponse 2",df_v204.loc[df_v204.V204 == 2].count())
print("Reponse 3",df_v204.loc[df_v204.V204 == 3].count())
print("Reponse 4",df_v204.loc[df_v204.V204 == 4].count())
print("Reponse 5",df_v204.loc[df_v204.V204 == 5].count())
print("Reponse 6",df_v204.loc[df_v204.V204 == 6].count())
print("Reponse 7",df_v204.loc[df_v204.V204 == 7].count())
print("Reponse 8",df_v204.loc[df_v204.V204 == 8].count())
print("Reponse 9",df_v204.loc[df_v204.V204 == 9].count())
print("Reponse 10",df_v204.loc[df_v204.V204 == 10].count())

print("From the responses we can see that the global pool of respondents believe that abortion is never justifiable")


Reponse 1 V204    40227
dtype: int64
Reponse 2 V204    7896
dtype: int64
Reponse 3 V204    6294
dtype: int64
Reponse 4 V204    4497
dtype: int64
Reponse 5 V204    9580
dtype: int64
Reponse 6 V204    4395
dtype: int64
Reponse 7 V204    3493
dtype: int64
Reponse 8 V204    3397
dtype: int64
Reponse 9 V204    1896
dtype: int64
Reponse 10 V204    4067
dtype: int64
From the responses we can see that the global pool of respondents believe that abortion is never justifiable


In [6]:
#Dropping non-positive values
df = df[(df['V204'] >= 0) | (df['V204'].isnull())]
df = df[(df['V2'] >= 0) | (df['V2'].isnull())]
print(df.head(5))
s = df.min(axis=0)
s[s < 0]
#Dropping missing values
df_final = df.dropna() 

   V2  V4  V5  V6  V7  V8  V9  V10  V11  V12  ...  MN_228S8  MN_229A  MN_230A  \
0  12   1   1   1  -2   1   1    2    1    1  ...         3       -3       -3   
1  12   1   2   3   4   2   2    2    2    2  ...         3       -3       -3   
2  12   1   3   2   4   2   1    2    2    2  ...         4        1        1   
3  12   1   1   3   4   3   1    2    1    2  ...         2        2        1   
4  12   1   1   1   2   1   1    1    3    2  ...         2        2        1   

   MN_233A  MN_237B1  MN_249A1  MN_249A3  I_RELIGBEL  I_NORM1  I_VOICE1  
0       -3        -3         1         1         0.0      1.0      0.00  
1       -3        -3         2        -1         0.0      1.0      0.66  
2        2        -3         1         1         0.0      1.0      0.33  
3        2        -3         1         2         0.0      1.0      0.00  
4        2        -3         1         2         0.0      1.0      0.66  

[5 rows x 328 columns]


In [7]:
print(df_final.shape)
print("The final number of observations is 79267")


(79267, 328)
The final number of observations is 79267


In [8]:
def f(row):
    if row['V204'] > 3:
        val = 1
    else:
        val = 0
    return val
df_final['Abortion'] = df.apply(f, axis=1)
#list(df_final)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [9]:
import numpy
pc = df_final[df_final.columns[1:]].corr()['Abortion'][:-1]
pc.sort_values(ascending = False) 
#Divorce, Homosexuality, Sex before marriage, Suicide have strong correlation with abortion. They represnt the possible psychological effects of having an abortion

V204    0.881048
V205    0.548653
V203    0.485419
V206    0.446394
V207    0.418271
          ...   
V138   -0.142894
V255   -0.149844
V223   -0.165924
V252   -0.191483
V152   -0.315280
Name: Abortion, Length: 327, dtype: float64

In [10]:
df_final=df_final.rename(columns = {'V2':'Country'})
list(df_final)

['Country',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V44_ES',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V56_NZ',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V74B',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V80',
 'V81',
 'V82',
 'V83',
 'V84',
 'V85',
 'V86',
 'V87',
 'V88',
 'V89',
 'V90',
 'V91',
 'V92',
 'V93',
 'V94',
 'V95',
 'V96',
 'V97',
 'V98',
 'V99',
 'V100',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V120',
 'V121

In [11]:
data = df_final.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
data2 = pd.get_dummies(data, columns = ['Country'])
data2 = data2.drop('V204', 1)
print(data2.head(5))


   V4  V5  V6  V7  V8  V9  V10  V11  V12  V13  ...  Country_752  Country_764  \
0   1   1   1  -2   1   1    2    1    1    1  ...            0            0   
1   1   2   3   4   2   2    2    2    2    1  ...            0            0   
2   1   3   2   4   2   1    2    2    2    2  ...            0            0   
3   1   1   3   4   3   1    2    1    2    2  ...            0            0   
4   1   1   1   2   1   1    1    3    2    1  ...            0            0   

   Country_780  Country_788  Country_792  Country_804  Country_840  \
0            0            0            0            0            0   
1            0            0            0            0            0   
2            0            0            0            0            0   
3            0            0            0            0            0   
4            0            0            0            0            0   

   Country_858  Country_860  Country_887  
0            0            0            0  
1           

In [12]:
dff= data2.loc[:, ~data2.columns.str.startswith('Country')]
#dff = dff.drop('V204', 1)
print(dff)
print("we now have",dff.shape,"rows and columns")
list(dff)

       V4  V5  V6  V7  V8  V9  V10  V11  V12  V13  ...  MN_229A  MN_230A  \
0       1   1   1  -2   1   1    2    1    1    1  ...       -3       -3   
1       1   2   3   4   2   2    2    2    2    1  ...       -3       -3   
2       1   3   2   4   2   1    2    2    2    2  ...        1        1   
3       1   1   3   4   3   1    2    1    2    2  ...        2        1   
4       1   1   1   2   1   1    1    3    2    1  ...        2        1   
...    ..  ..  ..  ..  ..  ..  ...  ...  ...  ...  ...      ...      ...   
90345   1   3   2   4   1   1    3    3    2    1  ...       -4       -4   
90346   1   1   1   3   1   1    3    1    2    1  ...       -4       -4   
90347   1   2   1   3   1   1    3    3    2    1  ...       -4       -4   
90348   1   2   2   3   1   1    2    2    2    1  ...       -4       -4   
90349   1   2   2   2   2   1    2    2    2    1  ...       -4       -4   

       MN_233A  MN_237B1  MN_249A1  MN_249A3  I_RELIGBEL  I_NORM1  I_VOICE1  \
0       

['V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V44_ES',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V56_NZ',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',
 'V67',
 'V68',
 'V69',
 'V70',
 'V71',
 'V72',
 'V73',
 'V74',
 'V74B',
 'V75',
 'V76',
 'V77',
 'V78',
 'V79',
 'V80',
 'V81',
 'V82',
 'V83',
 'V84',
 'V85',
 'V86',
 'V87',
 'V88',
 'V89',
 'V90',
 'V91',
 'V92',
 'V93',
 'V94',
 'V95',
 'V96',
 'V97',
 'V98',
 'V99',
 'V100',
 'V101',
 'V102',
 'V103',
 'V104',
 'V105',
 'V106',
 'V107',
 'V108',
 'V109',
 'V110',
 'V111',
 'V112',
 'V113',
 'V114',
 'V115',
 'V116',
 'V117',
 'V118',
 'V119',
 'V120',
 'V121',
 'V122',


In [13]:
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.model_selection import KFold

In [14]:
# define the cross validation function
def cross_validation(df,k):
    np.random.seed(5)                                            # set seed
    indices = np.arange(len(df))                                 # get the range of indices
    np.random.shuffle(df.index.values)                       # shuffle the index values
    for i in range(k):       
    
        itest = indices[i::k]                                    # select the indices for test set
        itrain = list(set(indices).difference(itest))            # select the indices for train set
        
        test_set = df.iloc[itest]                                # define the test set
        train_set = df.iloc[itrain]      
        x = df.iloc[:, :-1].values
        y = df.iloc[:, 1].values

        #spliting the dataset into training and test set
        x_train, x_test, y_train, y_test = train_test_split(x, y, 
        test_size=0.2, random_state=0)
    
    lin_reg = linear_model.LinearRegression()
    lin_model_cv = cross_val_score(lin_reg,x,y,cv=5)
    lin_model_cv
    print("RSquare: %0.2f (+/- %0.2f)" % (lin_model_cv.mean(), lin_model_cv.std() * 2))    
    

    return lin_model_cv.mean(), (lin_model_cv.std() * 2)
    

In [15]:
cross_validation(dff,5)

RSquare: 1.00 (+/- 0.00)


(0.9999568407287989, 0.00014283477834422045)

In [23]:
X = dff.loc[:, dff.columns != 'Abortion']
y = dff['Abortion']

# use train/test split with different random_state values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# check classification accuracy of KNN with K=5
knn = KNeighborsClassifier(n_neighbors=5,algorithm='kd_tree',n_jobs=10)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.8082959075541202


In [24]:
knn = KNeighborsClassifier(n_neighbors=5,algorithm='kd_tree',n_jobs=10)
scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(scores)

[0.68668012 0.70711403 0.76889113 0.73975022 0.64425382 0.50845319
 0.2768105  0.37761797 0.70817562 0.71284381]


In [25]:
knn = KNeighborsClassifier(n_neighbors=20,algorithm='kd_tree',n_jobs=10)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())

0.6377612170091367


In [221]:
# 10-fold cross-validation with logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())



0.9998864687562781


In [None]:
X = dff.loc[:, dff.columns != 'Abortion']
y = dff['Abortion']

# use train/test split with different random_state values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting Kernel SVM to the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10, n_jobs = 10)
print(accuracies.mean())
print(accuracies.std())

In [237]:
print("When we compare the three models in terms of accuracy we see that the logistical regression model out performs teh svm and knn models. The least accurate is the KNN.")
print("However it is observed that svm takes the longest computation time")
print("In terms of speed both the Knn and logistical regression models performed far better than the svm which was the slowest")
print("The best prediction is  observed in the logistical model")

When we compare the three models in terms of accuracy we see that the logistical regression model out performs teh svm and knn models. The least accurate is the KNN.
However it is observed that svm has a better f-score among the three but also takes the longest computation time
In terms of speed both the Knn and logistical regression models performed far better than the svm which was the slowest
A case of overfitting was also observed in the logistical model


In [238]:
print("Given our data set which has multiple factors and a high correlation among then I would do the analysis again with the logistical model")

Given our data set which has multiple factors and a high correlation among then I would do the analysis again with the svm model


In [None]:
#Trying svm once again
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score

X = dff.loc[:, dff.columns != 'Abortion']
y = dff['Abortion']

X.shape, y.shape

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

X_train.shape, y_train.shape

X_test.shape, y_test.shape


clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)

clf.score(X_test, y_test)                          

clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, X, y, cv=5, n_jobs = 10)
scores

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
print("Thus we see that logistical regression has the best results in terms of prediction, accuracy and time. Thus we use it to check the significance of country as well")

In [26]:
X = dff.loc[:, dff.columns != 'Abortion']
y = dff['Abortion']

# use train/test split with different random_state values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)
# 10-fold cross-validation with logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())



0.6921259633972854


In [16]:
X = data2.loc[:, data2.columns != 'Abortion']
y = data2['Abortion']
# use train/test split with different random_state values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)
# 10-fold cross-validation with logistic regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())



0.5679060191991215


In [None]:
#Thus we can see that the country variable does not considerbaly affect the prediction of abortion and it would be okay to not factor it into our analysis.