In [1]:
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import statsmodels.api as sm
import seaborn as sns
import scipy.stats as stats

%matplotlib inline

import warnings
# Silence every warning so library notices do not clutter the notebook output.
# NOTE(review): this also hides deprecation warnings -- consider narrowing it.
warnings.filterwarnings('ignore')

# Connection settings for the Postgres database queried below. Each value can
# be overridden through an environment variable, so credentials do not have to
# be edited in (or committed with) the notebook. The defaults are the values
# the notebook has always used, so it still runs when nothing is set.
postgres_user = os.environ.get('POSTGRES_USER', 'dsbc_student')
postgres_pw = os.environ.get('POSTGRES_PW', '7*.8G9QH21')
postgres_host = os.environ.get('POSTGRES_HOST', '142.93.121.174')
postgres_port = os.environ.get('POSTGRES_PORT', '5432')  # kept as a string, as before
postgres_db = os.environ.get('POSTGRES_DB', 'weatherinszeged')

In [2]:
# Build the connection URL. The user name and password are percent-encoded so
# that characters with a special meaning inside a URL (e.g. '@', '/', ':')
# cannot corrupt the connection string.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    quote(postgres_user, safe=''), quote(postgres_pw, safe=''),
    postgres_host, postgres_port, postgres_db))

# Only a single query is needed, so release the engine's connections as soon
# as it finishes -- even when the query raises.
try:
    df = pd.read_sql_query('select * from weatherinszeged', con=engine)
finally:
    engine.dispose()

# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,date,summary,preciptype,temperature,apparenttemperature,humidity,windspeed,windbearing,visibility,loudcover,pressure,dailysummary
0,2006-03-31 22:00:00+00:00,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-03-31 23:00:00+00:00,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 00:00:00+00:00,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 01:00:00+00:00,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 02:00:00+00:00,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [3]:
# Remove pandas' display limits so frames are shown with every row and column.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [4]:
# Target variable: apparent temperature minus measured temperature.
df['temp_dif'] = df['apparenttemperature'].sub(df['temperature'])

In [5]:
# Features (humidity and wind speed) and target (temperature difference).
X = df.loc[:, ['humidity', 'windspeed']]
y = df.loc[:, 'temp_dif']

In [6]:
# Fit scikit-learn's linear regression on the features; fit() returns the
# fitted estimator, so construction and fitting are chained.
lrm = linear_model.LinearRegression().fit(X, y)

# Report the estimated slopes and the intercept.
print('Coefficients: \n', lrm.coef_)
print('\nIntercept: \n', lrm.intercept_)

Coefficients: 
 [-3.02918594 -0.11929075]

Intercept: 
 2.4381054151876933


In [7]:
# statsmodels' OLS does not add an intercept by itself, so add a constant column.
X = sm.add_constant(X)

# Estimate the model by ordinary least squares.
results = sm.OLS(endog=y, exog=X).fit()

# Show the full regression table.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               temp_dif   R-squared:                       0.288
Model:                            OLS   Adj. R-squared:                  0.288
Method:                 Least Squares   F-statistic:                 1.949e+04
Date:                Fri, 12 Jul 2019   Prob (F-statistic):               0.00
Time:                        10:08:59   Log-Likelihood:            -1.7046e+05
No. Observations:               96453   AIC:                         3.409e+05
Df Residuals:                   96450   BIC:                         3.409e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.4381      0.021    115.948      0.0

OK, these seem to be significant, and the coefficients make sense to me: you would expect an increase in either humidity or windspeed to lower the apparent temperature relative to the measured temperature. The R-squared is rather small, though... interesting!

## Capturing interaction

In [8]:
# Interaction term: the product of humidity and wind speed.
df['humid_ws'] = df['humidity'].mul(df['windspeed'])

In [9]:
# Features now include the humidity x wind speed interaction; the target is unchanged.
X = df.loc[:, ['humidity', 'windspeed', 'humid_ws']]
y = df.loc[:, 'temp_dif']

In [10]:
# statsmodels' OLS does not add an intercept by itself, so add a constant column.
X = sm.add_constant(X)

# Estimate the interaction model by ordinary least squares.
results = sm.OLS(endog=y, exog=X).fit()

# Show the full regression table.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:               temp_dif   R-squared:                       0.341
Model:                            OLS   Adj. R-squared:                  0.341
Method:                 Least Squares   F-statistic:                 1.666e+04
Date:                Fri, 12 Jul 2019   Prob (F-statistic):               0.00
Time:                        10:14:23   Log-Likelihood:            -1.6669e+05
No. Observations:               96453   AIC:                         3.334e+05
Df Residuals:                   96449   BIC:                         3.334e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0839      0.033      2.511      0.0

Our coefficients are still statistically significant, but they have changed! This could be interpreted as humidity and windspeed combined decreasing our overall temperature. This makes sense, as an increase in both windspeed and humidity could be a sign of an incoming storm, which will often drop the temperature.