### Load packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

### Load datasets

In [14]:
# Read the cleaned dataset (the path is relative to the notebook's working directory)
df = pd.read_csv('../data/df_clean.csv')

In [16]:
# Most recent year present in the data; it is used as the hold-out (test) period
last_year = df['Year'].max()

In [17]:
# Temporal split: rows from the latest year form the test set,
# every other row goes to the training set
df_train_raw = df.loc[df['Year'] != last_year]
df_test_raw = df.loc[df['Year'] == last_year]

In [18]:
# Build the design matrix and target for each split.
# The target is the 'AveTemperature' column; the predictors are every column
# from position 3 onward.
# NOTE(review): the positional slice assumes the first three columns are
# non-feature columns and that the target is not among the rest - confirm
# against the column order of df_clean.csv.
X_train, y_train = df_train_raw.iloc[:, 3:], df_train_raw['AveTemperature']
X_test, y_test = df_test_raw.iloc[:, 3:], df_test_raw['AveTemperature']

In [153]:
# # save datasets (intentionally disabled; uncomment this cell to write the splits to ../data/)
# y_train.to_csv('../data/y_train.csv')
# X_train.to_csv('../data/X_train.csv')
# y_test.to_csv('../data/y_test.csv')
# X_test.to_csv('../data/X_test.csv')

### Modeling

In [19]:
# Create a scikit-learn linear regression estimator with its default settings
regr = linear_model.LinearRegression()

In [20]:
# Fit the model on the training split. As the last expression of the cell,
# the value returned by fit() is what the notebook displays as output.
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [21]:
# Predict the target for the held-out test rows (the most recent year)
y_pred = regr.predict(X_test)

In [22]:
# Report the fitted coefficients and the hold-out performance.
# Each message is formatted into a single string before printing: passing
# several comma-separated values to print(...) is rendered as a tuple by a
# Python 2 kernel, which is what the previously recorded output showed.
print('Coefficients: \n %s' % (regr.coef_,))
# The mean squared error on the test set
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
# r2_score is the coefficient of determination (R^2), not the explained
# variance score: 1 is perfect prediction
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

('Coefficients: \n', array([ -2.31972610e-03,   2.45828947e-03,  -3.38460260e-04,
         7.07086933e-04,  -2.48394920e-04,  -1.17603128e-04,
         5.09192269e-04,   1.02387331e-03,  -8.76486163e-04,
        -4.24237888e-04,   8.82778110e-03,   2.05699777e-03,
        -9.72897473e-04,  -2.15642608e-13,   1.17673296e-08,
         2.13810221e-06,   4.69698509e+00,   4.18734722e-02,
         4.40499477e-02,  -1.00027138e-06]))
Mean squared error: 16.42
Variance score: 0.65


In [23]:
# Refit the training data with statsmodels OLS to obtain the inferential
# statistics (standard errors, t-tests, confidence intervals) shown in the
# summary table. `sm` is imported in the import cell at the top of the notebook.
# add_constant adds the intercept column, reported as "const" in the summary.
X = sm.add_constant(X_train)
model = sm.OLS(y_train, X).fit()
# NOTE(review): the recorded summary reports a very large condition number
# (1.85e+22), which may indicate strongly collinear or badly scaled
# predictors - check before interpreting individual coefficients.
model.summary()

0,1,2,3
Dep. Variable:,AveTemperature,R-squared:,0.858
Model:,OLS,Adj. R-squared:,0.854
Method:,Least Squares,F-statistic:,203.1
Date:,"Wed, 03 Oct 2018",Prob (F-statistic):,5.04e-256
Time:,13:15:32,Log-Likelihood:,-1562.4
No. Observations:,660,AIC:,3165.0
Df Residuals:,640,BIC:,3255.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.4292,1.184,-7.966,0.000,-11.754,-7.105
CO2,-0.0023,0.000,-7.353,0.000,-0.003,-0.002
CO2prox1000,0.0025,0.000,8.249,0.000,0.002,0.003
CO2prox2000,-0.0003,9.58e-05,-3.533,0.000,-0.001,-0.000
CO2prox3000,0.0007,0.000,5.605,0.000,0.000,0.001
CO2prox4000,-0.0002,6.92e-05,-3.590,0.000,-0.000,-0.000
CO2prox5000,-0.0001,4.53e-05,-2.596,0.010,-0.000,-2.86e-05
CO2prox6000,0.0005,6.81e-05,7.480,0.000,0.000,0.001
CO2prox7000,0.0010,0.000,9.878,0.000,0.001,0.001

0,1,2,3
Omnibus:,45.561,Durbin-Watson:,1.925
Prob(Omnibus):,0.0,Jarque-Bera (JB):,146.148
Skew:,-0.252,Prob(JB):,1.8400000000000002e-32
Kurtosis:,5.25,Cond. No.,1.85e+22
