In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
import statsmodels.api as sm
from sqlalchemy import create_engine

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings(action="ignore")

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'weatherinszeged'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

weather_df = pd.read_sql_query('select * from weatherinszeged',con=engine)

# no need for an open connection, as we're only doing a single query
engine.dispose()

In [2]:
# Y is the target variable
Y = weather_df['apparenttemperature'] - weather_df['temperature']

# X is the feature set
X = weather_df[['humidity','windspeed']]

# We add constant to the model as it's a best practice
# to do so every time!
X = sm.add_constant(X)

# We fit an OLS model using statsmodels
results = sm.OLS(Y, X).fit()

# We print the summary results.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.288
Model:                            OLS   Adj. R-squared:                  0.288
Method:                 Least Squares   F-statistic:                 1.949e+04
Date:                Mon, 20 Jan 2020   Prob (F-statistic):               0.00
Time:                        21:09:48   Log-Likelihood:            -1.7046e+05
No. Observations:               96453   AIC:                         3.409e+05
Df Residuals:                   96450   BIC:                         3.409e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.4381      0.021    115.948      0.0

In [None]:
#r-sqaured = .288 and Adjusted r-squared = .288
# This is unsatisfactory because it's very low and gives a lot of room to improve the model

In [3]:
#interaction 
# Y is the target variable
Y = weather_df['apparenttemperature'] - weather_df['temperature']

weather_df["humidity"] = weather_df.humidity * weather_df.windspeed

# X is the feature set
X = weather_df[['humidity','windspeed']]

# We add a constant to the model as it's a best practice
# to do so every time!
X = sm.add_constant(X)

# We fit an OLS model using statsmodels
results = sm.OLS(Y, X).fit()

# We print the summary results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.341
Model:                            OLS   Adj. R-squared:                  0.341
Method:                 Least Squares   F-statistic:                 2.497e+04
Date:                Mon, 20 Jan 2020   Prob (F-statistic):               0.00
Time:                        21:11:15   Log-Likelihood:            -1.6670e+05
No. Observations:               96453   AIC:                         3.334e+05
Df Residuals:                   96450   BIC:                         3.334e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2178      0.008     26.385      0.0

In [None]:
#The new r-squared and adjusted r-squared have barely risen, still low given they're below the .50 mark
#This doesn't improve the model by much

In [4]:
# Y is the target variable
Y = weather_df['apparenttemperature'] - weather_df['temperature']

# X is the feature set with visibility
X = weather_df[['humidity','windspeed','visibility']]

# We add constant to the model as it's a best practice
# to do so every time!
X = sm.add_constant(X)

# We fit an OLS model using statsmodels
results = sm.OLS(Y, X).fit()

# We print the summary results.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.361
Method:                 Least Squares   F-statistic:                 1.815e+04
Date:                Mon, 20 Jan 2020   Prob (F-statistic):               0.00
Time:                        21:12:45   Log-Likelihood:            -1.6524e+05
No. Observations:               96453   AIC:                         3.305e+05
Df Residuals:                   96449   BIC:                         3.305e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.3657      0.013    -27.184      0.0

In [None]:
#This is the best model by far because the r-squared and adjusted r-sqaured 
#values, because it's the highest among the three at 0.361; and the AIC and BIC numbers are the lowest