# Data Loading and Exploration

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Directory holding the Correctness CSV files. It defaults to the location the
# notebook was originally written against; set the CORRECTNESS_DATA_DIR
# environment variable to run it elsewhere without editing this cell.
DATA_DIR = Path(os.environ.get('CORRECTNESS_DATA_DIR', '/Users/moukaii/Downloads/Data/Correctness'))

# The first CSV column becomes the row index of each frame.
train_data = pd.read_csv(DATA_DIR / 'Correctness_Train.csv', index_col=0)
test_data = pd.read_csv(DATA_DIR / 'Correctness_Test.csv', index_col=0)
print('Train:{}   Test:{}'.format(train_data.shape, test_data.shape))
# Preview only: first five rows, first seven columns.
train_data.head().iloc[:,0:7]

In [None]:
# Summary statistics of the training frame; only the first five columns are displayed.
train_data.describe().iloc[:, :5]

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Histogram grid of the training frame: 60 bins per panel on a 20x20-inch canvas.
train_data.hist(figsize=(20, 20), bins=60)
plt.show()

In [None]:
import seaborn as sns
# Annotated heatmap of the pairwise correlations in the training frame,
# one decimal place per cell, on a diverging colour map.
plt.figure(figsize=(25, 20))
sns.heatmap(
    train_data.corr(),
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
)
plt.show()

In [None]:
# Bar chart of every column's absolute correlation with the target, largest first.
# The target column itself is part of the correlation matrix, so it is plotted too.
fig = plt.figure(figsize=(14, 8))
train_data.corr()['correctness'].abs().sort_values(ascending=False).plot.bar()
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

In [None]:
from scipy import stats
from scipy.stats import norm
# Distribution of the raw target, followed by the distribution of its
# log(1 + x) transform, each with a kernel-density overlay.
dis = sns.displot(data=train_data['correctness'], kde=True)
dislog = sns.displot(data=np.log1p(train_data['correctness']), kde=True)

In [None]:
from scipy import stats
from scipy.stats import norm
# Separate the target column from the features of both frames.
# `pop` removes the column in place, so each call is guarded: re-running this
# cell after the column is already gone would otherwise raise KeyError.
if 'correctness' in train_data.columns:
    # The models are fit on log(1 + target).
    trainVar = np.log1p(train_data.pop('correctness'))
if 'correctness' in test_data.columns:
    # The test target is kept on its original (untransformed) scale.
    testVar = test_data.pop('correctness')
trainVar.head()

In [None]:
from scipy import stats
from scipy.stats import norm
# Distribution of the transformed training target, with a kernel-density overlay.
dis = sns.displot(data=trainVar, kde=True)

# Data Processing and Feature Engineering

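First, the training and test sets are combined so that the same transformations are applied to both. (Numerical variables that are really categorical would be converted at this stage; none are converted in this notebook.)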

In [None]:
# Stack the training rows on top of the test rows so both receive the same preprocessing.
combined_data = pd.concat((train_data, test_data), axis=0)
combined_data.head()
# Displayed result: (rows, columns) of the stacked frame.
combined_data.shape

## Standardization

In [None]:
# Z-score every non-object column: subtract the column mean, divide by the column std.
numericVar = combined_data.columns[combined_data.dtypes != 'object']
numericVar_means = combined_data[numericVar].mean()
numericVar_std = combined_data[numericVar].std()
# NOTE(review): the mean/std are computed over the combined train + test rows,
# so test-set information influences the scaling - confirm this is intended.
# Assign whole columns rather than writing through `.loc[:, cols]`: integer
# columns must become float here, and replacing the columns avoids relying on
# the in-place upcasting that recent pandas versions deprecate.
combined_data[numericVar] = (combined_data[numericVar] - numericVar_means) / numericVar_std
# Preview of the first rows only.
combined_data.head()

In [None]:
# Split the processed frame back into its train and test parts by position.
# combined_data was built as [train rows, then test rows], so the first
# len(train_data) rows are the training set. A label-based lookup
# (`.loc[train_data.index]`) would return rows from BOTH sets whenever the two
# CSVs share index labels, silently corrupting the split.
n_train_rows = len(train_data)
new_train_data = combined_data.iloc[:n_train_rows]
new_test_data = combined_data.iloc[n_train_rows:]
print('New Train:{}   New Test:{}'.format(new_train_data.shape, new_test_data.shape))

# Model Establishment and Training

## Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, KFold, train_test_split
# Plain NumPy feature matrices for scikit-learn, taken from the processed frames.
array_train = new_train_data.to_numpy()
array_test = new_test_data.to_numpy()

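Cross-validation is used to estimate the out-of-sample error for each candidate regularization strength, so that the chosen value does not simply overfit the training data.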

In [None]:
# Sweep 100 Ridge regularization strengths, log-spaced between 10**1.5 and 10**2.5,
# and record the mean cross-validated RMSE for each one.
lambda_k = np.logspace(1.5,2.5,100)
test_scores = []
n_folds = 100
# Pass the KFold splitter itself as `cv`. Calling `.get_n_splits()` on it only
# returns the integer fold count, which would make cross_val_score fall back to
# unshuffled folds and ignore `shuffle` / `random_state`.
# The splitter is built once so every candidate is scored on identical folds.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=40)
for k in lambda_k:
    clf = Ridge(alpha=k)
    # Scores come back as negative MSE per fold; negate and take the root for per-fold RMSE.
    test_score = np.sqrt(-cross_val_score(clf,array_train,trainVar,cv=kf,scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

In [None]:
# Mean cross-validated RMSE as a function of the Ridge regularization strength.
plt.plot(lambda_k,test_scores)
plt.title('Ridge Parameter - Cross Validation Error')
# Axis labels so the figure can be read on its own.
plt.xlabel('Ridge alpha')
plt.ylabel('Mean CV RMSE')
# Explicit show(), as in the other plotting cells; also avoids echoing a Text repr.
plt.show()

The optimal regularization strength is the value at which the cross-validated error curve above reaches its minimum.

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Sweep the fraction of features considered at each split and record the mean
# 5-fold cross-validated RMSE for each setting.
rfpall = [.1,.2,.25,.3,.35,.4,.99]
test_scores = []
for rfp in rfpall:
    # Fixed seed: without it every run grows different forests, so the curve
    # (and the setting read off it) would change from run to run.
    clf = RandomForestRegressor(n_estimators = 400,max_features = rfp,random_state = 40)
    # Scores come back as negative MSE per fold; negate and take the root for per-fold RMSE.
    test_score = np.sqrt(-cross_val_score(clf,array_train,trainVar,cv = 5,scoring = 'neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
plt.plot(rfpall,test_scores)
plt.title('Max Features - Error')
# Axis labels so the figure can be read on its own.
plt.xlabel('max_features (fraction of features per split)')
plt.ylabel('Mean CV RMSE')
plt.show()

The optimal `max_features` value is the one at which the cross-validated error curve above reaches its minimum.

## Averaging Predictions


In [None]:
# Fit both models on the full training matrix with fixed hyper-parameters.
ridge = Ridge(alpha = 220)
# Fixed seed so the forest, and therefore the blended predictions, are reproducible.
rf = RandomForestRegressor(n_estimators = 400,max_features = .4,random_state = 40)
ridge.fit(array_train,trainVar)
rf.fit(array_train,trainVar)

# The training target is log1p-transformed, so expm1 maps the predictions back
# to the original scale.
y_ridge = np.expm1(ridge.predict(array_test))
y_rf = np.expm1(rf.predict(array_test))

# Final prediction: unweighted average of the two models.
y_hat = (y_ridge + y_rf) / 2

In [None]:
# Pair each test-row index with its blended prediction.
results = pd.DataFrame({'Id': test_data.index, 'Correctness': y_hat})
# Preview of the first rows only.
results.head()

In [None]:
# Correlation-coefficient matrix between the blended predictions and the
# held-out test target; the off-diagonal entries are the prediction/target correlation.
accuracy_model = np.corrcoef(results['Correctness'], y=testVar)
accuracy_model