In [46]:
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np
from plotnine import *
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression # Linear Regression Model
from sklearn.preprocessing import StandardScaler #Z-score variables
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score #model evaluation

from sklearn.model_selection import train_test_split # simple TT split cv
from sklearn.model_selection import KFold # k-fold cv
from sklearn.model_selection import LeaveOneOut #LOO cv


%matplotlib inline

In [47]:
#Load data

trainData = pd.read_csv('train_data.csv')
testData = pd.read_csv('test_data.csv')

trainData.head()

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,0,0.0,0.833333,9/1/14,237.0,29.02,31.64,29.57,30.73,29.71,...,-27.68,-37.21,8.32,9.56,-2.03,48.13,28.09,-13.5,11.9,4.58
1,1,0.0,0.833333,9/2/14,228.9,29.02,31.64,29.57,30.73,29.71,...,-21.13,-36.57,8.77,21.17,4.44,48.6,27.41,-23.77,15.44,3.42
2,2,0.0,0.833333,9/3/14,220.69,29.02,31.64,29.57,30.73,29.71,...,-10.72,-34.16,6.99,32.16,5.01,48.53,19.21,-33.16,15.11,4.82
3,3,0.0,0.833333,9/4/14,225.28,29.02,31.64,29.57,30.73,29.71,...,0.33,-31.04,6.17,39.66,-1.41,50.59,8.29,-37.22,18.24,9.74
4,4,0.0,0.833333,9/5/14,237.24,29.02,31.64,29.57,30.73,29.71,...,9.83,-31.8,7.47,38.62,-5.21,54.73,-2.58,-42.3,21.91,10.95


## **LINEAR REGRESSION**

In [48]:
#Split the train data
preds = ['contest-pevpr-sfc-gauss-14d__pevpr', 'nmme0-tmp2m-34w__cancm30', 'nmme0-tmp2m-34w__ccsm40']

#59
target = ['contest-tmp2m-14d__tmp2m']



X_train, X_test, y_train, y_test = train_test_split(trainData[preds], trainData[target], test_size = 0.2,
                                                    random_state = 10)

In [49]:
#Create and fit model
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [50]:
#Predictions
y_pred = model.predict(X_test)

In [51]:
#Training R2
print('Train R2: ', model.score(X_train, y_train))

#Testing R2
print('Test R2: ', model.score(X_test, y_test))

Train R2:  0.7897158234625546
Test R2:  0.7909013371531577


In [52]:
#Training MSE
print(mean_squared_error(y_train, model.predict(X_train)))

#Testing MSE
print(mean_squared_error(y_test, y_pred))

20.50359585716284
20.25310116264779


In [53]:
#Predict on test data
finalPreds = model.predict(testData[preds])
finalPreds

array([[22.71421718],
       [22.59868615],
       [22.66910507],
       ...,
       [ 6.40604297],
       [ 6.62192096],
       [ 6.76187855]])

In [60]:
#Create Results Dataframe

results = pd.DataFrame(finalPreds, columns = ['contest-tmp2m-14d__tmp2m'])

results['index'] = testData['index']

results

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,22.714217,375734
1,22.598686,375735
2,22.669105,375736
3,22.844712,375737
4,23.099541,375738
...,...,...
31349,6.245840,407083
31350,6.481083,407084
31351,6.406043,407085
31352,6.621921,407086


In [61]:
results.to_csv('practice_solution.csv', index = False)

In [None]:
#True vs. Pred
true_vs_pred = pd.DataFrame({"Predicted": y_pred,
                             "True": y_test})

true_vs_pred

In [None]:
#Plot
(ggplot(true_vs_pred, aes(x = 'True', y = 'Predicted')) + geom_point())

## **DIMENSIONALITY REDUCTION**

In [None]:
from plotnine import *
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import numpy as np

In [None]:
#Grab the numeric columns of interest
features = trainData.columns[5:247]

In [None]:
#Create and fit PCA model
pca = PCA()
pca.fit(trainData[features])

ValueError: ignored

In [None]:
pcaDF = pd.DataFrame({'expl_var': pca.explained_variance_ratio_,
                      'pc': range(1,242),
                      'cum_var': pca.explained_variance_ratio_.cumsum()})
pcaDF.head()

AttributeError: ignored

In [None]:
(ggplot(pcaDF, aes(x = "pc", y = "expl_var")) + geom_line() + geom_point())


NameError: ignored

In [None]:
(ggplot(pcaDF, aes(x = "pc", y = "cum_var")) + geom_line(color = "pink") +
 geom_point(color = "pink") + geom_hline(yintercept = 0.95))