In [1]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
from datetime import date
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import sklearn as sk
import six
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
from pylab import rcParams
rcParams['figure.figsize'] = 16, 9

In [2]:
data = pd.read_csv('cleaned_data.csv')

# Multicollinearity checking

#### Based on research we drop one of dummy variables for each group (Gender, Year of birth, Level of education) of dummy created variables. This aproach is based on seperating one referent variable, usually the one with most rows.

In [3]:
variables = data.drop(['course_id', 'userid_DI', 'grade', 'Unknown/Other', 'UnknownLoE','u', 'o', 'm', 'YoB' ], axis=1)
#droping:
# descriptive columns that are not considered as variables like course id
# grade
# u and o columns because they have no remining values
# one referent variable for each group of dummy variables

In [4]:
correlations = variables.corr()

In [5]:
w, v = np.linalg.eig(correlations) #compute the eigenvalues and right eigenvectors
# w is array of eigenvalues
# v is array of normalized eigenvectors

In [6]:
df = pd.DataFrame(columns= ['variables', 'w'])

In [7]:
df['variables'] = correlations.columns
df['w'] = w

In [8]:
df

Unnamed: 0,variables,w
0,viewed,3.918071
1,explored,1.930139
2,nevents,0.039928
3,ndays_act,0.124346
4,nplay_video,0.164388
5,nchapters,0.154892
6,nforum_posts,1.604177
7,Australia,0.322393
8,Bangladesh,0.444625
9,Brazil,1.394412


We see that eigenvalues are closest to 0 for variables nevents, ndays_active, nplay_videos and nchapters. This could possibly be explained by saying that all of them measure some types of interaction with course materials so it makes sense that they are more strongly correlated. 
#### Based on research of how to deal with multicollinearity we have decided to remove two of them from final set of variables. This will solve the problem of existance of multicollinearity.

## Multicollinearity checking without dummy variables
We were curious too see the result in this special case and decided to leave also this check for final version.

In [9]:
colums = ['viewed', 'explored', 'nevents', 'ndays_act',
       'nplay_video', 'nchapters', 'nforum_posts', 'total_activity',
       'age']

In [10]:
n_c = data[colums].corr()

In [11]:
w, v = np.linalg.eig(n_c)

In [12]:
df1 = pd.DataFrame(columns= ['variables', 'w'])

In [13]:
df1['variables'] = n_c.columns
df1['w'] = w

In [14]:
df1

Unnamed: 0,variables,w
0,viewed,3.888165
1,explored,1.088961
2,nevents,1.01683
3,ndays_act,0.953093
4,nplay_video,0.768979
5,nchapters,0.669203
6,nforum_posts,0.164721
7,total_activity,0.127928
8,age,0.322121


Lowest eigenvalue is for column total_activity.

#### The good confirmation of our analysis is the fact that correlation analysis in previous notebook and the multicollinearity analysis preforemed here both point out that variables that measure interaction with courses materials are mostly correlated with each other and that this needs to be handeld so it wouldn't affect our model.

# Linear regression

In [15]:
excluded = ['course_id', 'o', 'u','userid_DI', 'final_cc_cname_DI', 'start_time_DI', 'last_event_DI', 'LoE_DI','YoB',
            'gender','grade'] 

In [16]:
regression_data = data.drop(excluded, axis=1) # excluding all columns that are not suitable variables for linear regression

In [17]:
cols = ['nchapters', 'total_activity', 'nplay_video', 'age']

In [18]:
X, y = regression_data.drop(cols, axis=1), data['grade']
lm = LinearRegression()
lm.fit(X, y)
predictions = lm.predict(X)
print('R2:', metrics.r2_score(y, predictions))

R2: 0.643208031565


In [19]:
list(zip(X.columns, lm.coef_))

[('viewed', -0.0053201229265095488),
 ('explored', 0.21071638682158703),
 ('nevents', 3.1782418661613354e-05),
 ('ndays_act', 0.0043903907823268952),
 ('nforum_posts', 0.002345151503285875),
 ('Australia', 0.0034774716558332447),
 ('Bangladesh', -0.0041994188117098208),
 ('Brazil', -0.0012788803019587076),
 ('Canada', -0.0010852917403241097),
 ('China', 0.0016254575812153439),
 ('Colombia', 0.0022640324516981988),
 ('Egypt', -0.0058316256612901789),
 ('France', 0.0061879213226447641),
 ('Germany', 0.0093188423357371472),
 ('Greece', 0.0099002730280920281),
 ('India', 0.0049259586229265101),
 ('Indonesia', 0.0012359100392574617),
 ('Japan', -0.0030246601598731236),
 ('Mexico', -0.0052756872941620986),
 ('Morocco', -0.0018892680005211313),
 ('Nigeria', -0.0008764970783809228),
 ('Other Africa', -0.00061780273542454481),
 ('Other East Asia', 0.00051793148126120722),
 ('Other Europe', -0.0016476523896827196),
 ('Other Middle East/Central Asia', -0.0014388180887866808),
 ('Other North & Cen

In [20]:
lm.intercept_

-0.0031258695978068629

In [21]:
sk.feature_selection.f_regression(X,y)

(array([  1.42910243e+04,   4.55802468e+05,   4.97600346e+05,
          5.78924714e+05,   6.32284474e+03,   7.01051029e+00,
          2.92469920e+01,   4.59315834e+00,   8.41254238e-02,
          1.78342543e+01,   6.55812122e+01,   6.06763948e+01,
          5.25778870e+01,   3.04879535e+02,   2.03863286e+02,
          8.20236054e+02,   2.48426537e+00,   9.18758886e+00,
          5.19709491e-01,   7.44791258e+01,   8.25988808e+00,
          6.69071353e+00,   8.16783415e+00,   6.05204827e+02,
          3.05147218e+01,   3.12032613e+01,   5.96058558e+00,
          1.19740756e+01,   2.80663387e+01,   6.33126359e+01,
          1.47376537e+01,   5.18131273e+02,   7.46517186e+01,
          3.97762143e+02,   1.41666112e+03,   1.11301297e+02,
          8.42582031e+01,   3.04030812e+02,   3.59024777e+03,
          1.55988088e+02,   5.92512337e+01,   3.65799551e+00,
          9.02287669e+02,   1.19106918e+02,   2.46799377e+02,
          2.91292857e+02,   2.91292857e+02]),
 array([  0.00000000e+00