In [131]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
import numpy as np
import pandas as pd


csv_file = 'Actual-values-unnormalised-no-encoding.csv'

In [132]:
csv_file = pd.read_csv(csv_file, delimiter=',')

In [133]:
true_output = csv_file[['Euclid Distance']].to_numpy()
prepos_dummies = pd.get_dummies(csv_file['preposition']).to_numpy()
latlong = csv_file[['LongF', 'LongG']].to_numpy()

latlong_mean = np.mean(latlong, axis=0, keepdims=True)
latlong_stdev = np.std(latlong, axis=0, keepdims=True)
latlong_norm = (latlong - latlong_mean) / latlong_stdev

true_output_mean = np.mean(true_output, axis=0, keepdims=True)
true_output_stdev = np.std(true_output, axis=0, keepdims=True)
true_output_norm = (true_output - true_output_mean) / true_output_stdev

data_norm = np.concatenate([latlong_norm, prepos_dummies], axis=1)
data_natural = np.concatenate([latlong, prepos_dummies], axis=1)

In [152]:
pd.get_dummies(csv_file['preposition']).columns.to_list()

['above',
 'across',
 'adjacent to',
 'along',
 'alongside',
 'around',
 'at',
 'behind',
 'beside',
 'beyond',
 'by',
 'close to',
 'in',
 'inside',
 'near',
 'next to',
 'off',
 'on',
 'opposite',
 'outside',
 'over',
 'past',
 'through',
 'towards']

In [None]:
from sklearn.linear_model import LinearRegression
model_norm=LinearRegression().fit(data_norm, true_output_norm)
predicted_norm = model_norm.predict(data_norm)
predicted_denormed = predicted_norm * true_output_stdev + true_output_mean

In [None]:
model_natural=LinearRegression().fit(data_natural, true_output)
predicted_natural = model_natural.predict(data_natural)

In [None]:
fig = plt.figure()
ax = plt.axes()
ax.plot(true_output);
ax.plot(predicted_natural);

In [None]:
fig = plt.figure()
ax = plt.axes()
ax.plot(true_output);
ax.plot(predicted_denormed);

In [None]:
pd.DataFrame(data = np.concatenate([true_output, predicted_denormed], axis = 1), columns=['true', 'predicted_denorm'])

In [None]:
pd.DataFrame(data = np.concatenate([true_output, predicted_natural], axis = 1), columns=['true', 'predicted_natural'])

In [None]:
from sklearn.neural_network import MLPRegressor
clf = MLPRegressor(alpha=1e-6, hidden_layer_sizes=(20, 5), random_state=1, max_iter=2000)
clf = clf.fit(data_norm, true_output_norm.ravel())
predicted_norm = clf.predict(data_norm).reshape((-1,1))
predicted_denormed = predicted_norm * true_output_stdev + true_output_mean
pd.DataFrame(data = np.concatenate([true_output, predicted_denormed], axis = 1), columns=['true', 'predicted_denorm'])

In [None]:
fig = plt.figure()
ax = plt.axes()
ax.plot(true_output);
ax.plot(predicted_denormed);

In [None]:
clf = MLPRegressor(alpha=1e-3, hidden_layer_sizes=(50, 5), random_state=1, max_iter=2000)
clf = clf.fit(data_norm, true_output.ravel())
predicted = clf.predict(data_norm).reshape((predicted.size,-1))
pd.DataFrame(data = np.concatenate([true_output, predicted], axis = 1), columns=['true', 'predicted'])

In [21]:
import statsmodels.api as sm
from scipy import stats


X2 = sm.add_constant(data_norm)
est = sm.OLS(true_output_norm, X2)
est2 = est.fit()


headers = csv_file.columns.to_list()
dummy_headers = pd.get_dummies(csv_file['preposition']).columns.to_list()
xname = np.concatenate([['const', 'LatF', 'LongF', 'LatG', 'LongG'], dummy_headers]);

print(est2.summary(xname=xname.tolist()))

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.381
Model:                            OLS   Adj. R-squared:                  0.356
Method:                 Least Squares   F-statistic:                     15.32
Date:                Sun, 26 Apr 2020   Prob (F-statistic):           1.74e-53
Time:                        16:02:33   Log-Likelihood:                -825.35
No. Observations:                 700   AIC:                             1707.
Df Residuals:                     672   BIC:                             1834.
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           0.1203      0.056      2.144      

In [None]:
pd.get_dummies(csv_file['preposition'])

In [4]:
from util import onehot_by_group

In [43]:
groups = (
    ('next to', 'near', 'adjacent to', 'outside'),
    ('beside', 'close to'),
    ('by', 'past', 'off', 'beyond'),
    ('along', 'alongside'),
    ('opposite', 'behind', 'above', 'on', 'at'),
    ('in', 'inside'),
    ('around',),
    ('towards',),
    ('through', 'over', 'across')
)
group_names = ('Approximate 1', 'Approximate 2', 'Far', 'Alongness', 'Facing', 'In', 'Around', 'Towards', 'Throughness')


# groups = (
#     ('next to', 'near', 'adjacent to', 'outside'),
#     ('beside', 'close to'),
#     ('by', 'past', 'off', 'beyond'),
#     ('along', 'alongside'),
#     ('opposite', 'behind', 'above'),
#     ('in', 'inside'),
#     ('around', 'towards', 'at', 'on'),
#     ('through', 'over', 'across')
# )
# group_names = ('Approximate 1', 'Approximate 2', 'Far', 'Alongness', 'Facing', 'In', '?', 'Throughness')


In [44]:
raw_prepositions = csv_file['preposition'].to_numpy()
result = onehot_by_group(raw_prepositions, groups, group_names)

In [45]:
dummy_column_names = list(result.keys())
dummy_column_values = np.ndarray((len(raw_prepositions), len(groups)), dtype=np.int32)

In [47]:
for idx, column in enumerate(result.values()):
    dummy_column_values[:, idx] = column

In [17]:
data_norm = np.concatenate([latlong_norm, dummy_column_values], axis=1)

In [22]:
import statsmodels.api as sm
from scipy import stats


X2 = sm.add_constant(latlong_norm)
est = sm.OLS(true_output_norm, X2)
est2 = est.fit()


headers = csv_file.columns.to_list()
dummy_headers = pd.get_dummies(csv_file['preposition']).columns.to_list()
xname = np.concatenate([['const', 'LatF', 'LongF', 'LatG', 'LongG']]);

print(est2.summary(xname=xname.tolist()))

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.335
Model:                            OLS   Adj. R-squared:                  0.331
Method:                 Least Squares   F-statistic:                     87.35
Date:                Sun, 26 Apr 2020   Prob (F-statistic):           3.99e-60
Time:                        16:02:43   Log-Likelihood:                -850.70
No. Observations:                 700   AIC:                             1711.
Df Residuals:                     695   BIC:                             1734.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.188e-13      0.031  -3.84e-12      1.0

In [143]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

poly = PolynomialFeatures(degree=2)
poly_latlong_norm = poly.fit_transform(latlong_norm)

est = sm.OLS(true_output_norm**2, poly_latlong_norm)
est2 = est.fit()

headers = csv_file.columns.to_list()
dummy_headers = pd.get_dummies(csv_file['preposition']).columns.to_list()
xname = poly.get_feature_names(['LongF', 'LongG']);

print(est2.summary(xname=xname))

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.982
Model:                            OLS   Adj. R-squared:                  0.982
Method:                 Least Squares   F-statistic:                     7716.
Date:                Sun, 26 Apr 2020   Prob (F-statistic):               0.00
Time:                        17:46:20   Log-Likelihood:                -752.38
No. Observations:                 700   AIC:                             1517.
Df Residuals:                     694   BIC:                             1544.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
1               0.1879      0.039      4.853      

In [141]:
# predicted_norm_squared = est2.predict(poly_latlong_norm).reshape((-1, 1))
# predicted_norm = np.sqrt(predicted_norm_squared)
# predicted_denormed = predicted_norm * true_output_stdev + true_output_mean
# pd.DataFrame(data = np.concatenate([true_output, predicted_norm_squared], axis = 1), columns=['true', 'predicted'])

# fig = plt.figure()
# ax = plt.axes()
# ax.plot(true_output);
# ax.plot(predicted_denormed);

['1', 'LongF', 'LongG', 'LongF^2', 'LongF LongG', 'LongG^2']

In [146]:
pd.DataFrame(data=true_output_norm**2)

Unnamed: 0,0
0,0.399213
1,0.384025
2,0.381375
3,0.380772
4,0.377711
...,...
695,0.000097
696,0.006251
697,0.240564
698,0.289722


In [109]:
predicted_norm, np.sqrt(predicted_norm)

  """Entry point for launching an IPython kernel.


(array([[ 1.57666094e-01],
        [ 1.57330876e-01],
        [ 1.54545916e-01],
        [ 1.51940432e-01],
        [ 1.46951574e-01],
        [ 1.36853387e-01],
        [ 1.59164799e-01],
        [ 1.81614790e-01],
        [ 1.79609979e-01],
        [ 1.50668027e-01],
        [ 1.81270135e-01],
        [ 1.68059442e-01],
        [ 1.66518917e-01],
        [ 1.38480342e-01],
        [ 1.34014030e-01],
        [ 1.38217584e-01],
        [ 1.38217584e-01],
        [ 1.49809251e-01],
        [ 1.88691431e-01],
        [ 1.32945853e-01],
        [ 1.35188963e-01],
        [ 2.19307441e-01],
        [ 1.88374363e-01],
        [ 1.59237665e-01],
        [ 1.40721555e-01],
        [ 1.51900101e-01],
        [ 1.88749580e-01],
        [ 1.79305457e-01],
        [ 2.31979235e-01],
        [ 1.86866018e-01],
        [ 1.29105496e-01],
        [ 1.31519353e-01],
        [ 2.08501579e-01],
        [ 2.38324598e-01],
        [ 1.18873962e-01],
        [ 1.80220614e-01],
        [ 1.39260580e-01],
 

In [50]:
dummy_column_values.shape

(700, 9)

In [84]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

poly = PolynomialFeatures(degree=2)
poly_latlong_norm = poly.fit_transform(latlong_norm)

data_norm = np.concatenate([poly_latlong_norm, dummy_column_values], axis=1)

X2 = sm.add_constant(data_norm)
est = sm.OLS(true_output_norm**2, X2)
est2 = est.fit()

poly_var_names = []
for i in range(1, poly_latlong_norm.shape[1] + 1):
    poly_var_names.append('Poly Var {}'.format(i))

xname = np.concatenate([np.array(poly_var_names), np.array(group_names)]).tolist();

print(est2.summary(xname=xname))

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.147
Model:                            OLS   Adj. R-squared:                  0.131
Method:                 Least Squares   F-statistic:                     9.076
Date:                Sun, 26 Apr 2020   Prob (F-statistic):           2.11e-17
Time:                        16:43:21   Log-Likelihood:                -2109.4
No. Observations:                 700   AIC:                             4247.
Df Residuals:                     686   BIC:                             4311.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Poly Var 1        0.9579      0.298      3.218

In [67]:
xname = np.concatenate([np.array(poly_var_names), np.array(group_names)]);

In [77]:
dummy_column_values.shape

(700, 9)