In [1]:
# Data Analysis and Modeling
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Imports for Implementing Poisson Model
from patsy import dmatrices
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Packages for PostgreSQL Import
import psycopg2

In [2]:
# Ideally I'll move this into the project config.py file
# Otherwise for now I have to just manually assign . . .

dbname = "freelance_db"
username = "Metaverse"
pswd = "Arcifice91"

In [3]:
# Connect to Data (from: scraping_data.ipynb)

con = None
con = psycopg2.connect(database = dbname, user = username, host='localhost', password=pswd)

# Extract freelance_db as fl_table, don't bring Punjab obs
sql_query = """SELECT * FROM analysis_table;"""
analysis_dt = pd.read_sql_query(sql_query, con)
analysis_dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948 entries, 0 to 947
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   index               948 non-null    int64  
 1   invoices_per_month  948 non-null    float64
 2   Region              948 non-null    object 
 3   hourly_rate         948 non-null    int64  
 4   less_five_skills    948 non-null    int64  
 5   bio_length          948 non-null    int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 44.6+ KB


### Poisson Model

1. See: https://towardsdatascience.com/an-illustrated-guide-to-the-poisson-regression-model-50cccba15958

In [6]:
# Model setup using patsy
mask = np.random.rand(len(analysis_dt)) < 0.8
df_train = analysis_dt[mask]
df_test = analysis_dt[~mask]
print('Training data set length='+str(len(df_train)))
print('Testing data set length='+str(len(df_test)))

expr = """invoices_per_month ~ hourly_rate + less_five_skills + bio_length"""

y_train, X_train = dmatrices(expr, df_train, return_type='dataframe')
y_test, X_test = dmatrices(expr, df_test, return_type='dataframe')


poisson_training_results = sm.GLM(y_train, X_train, family=sm.families.Poisson()).fit()
print(poisson_training_results.summary())



Training data set length=744
Testing data set length=204
                 Generalized Linear Model Regression Results                  
Dep. Variable:     invoices_per_month   No. Observations:                  744
Model:                            GLM   Df Residuals:                      740
Model Family:                 Poisson   Df Model:                            3
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1299.1
Date:                Tue, 02 Jun 2020   Deviance:                       1571.3
Time:                        16:29:11   Pearson chi2:                 4.04e+03
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------

In [8]:
poisson_predictions = poisson_training_results.get_prediction(X_test) 
predictions_summary_frame = poisson_predictions.summary_frame()
print(predictions_summary_frame)

         mean   mean_se  mean_ci_lower  mean_ci_upper
1    1.777168  0.126970       1.544950       2.044291
6    1.440822  0.096754       1.263137       1.643503
15   0.787909  0.047287       0.700471       0.886261
16   1.907684  0.165756       1.608965       2.261863
22   1.144187  0.055888       1.039727       1.259141
..        ...       ...            ...            ...
931  0.758668  0.047392       0.671243       0.857480
934  0.866280  0.044652       0.783040       0.958368
935  0.646814  0.075222       0.514977       0.812402
937  0.740662  0.082568       0.595290       0.921534
944  0.788709  0.089640       0.631211       0.985506

[204 rows x 4 columns]


In [13]:
def squared_error(truth,predict):
    return (truth-predict)^2

In [37]:
y_test.iloc[1,0]

21.17032967032967

In [39]:
# Calculating Mean Square Error
se = []
for i in range(0,len(y_test)):
    se[i] = squared_error(y_test.iloc[i,0],predictions_summary_frame['mean'].iloc[i])
se.mean()

TypeError: ufunc 'bitwise_xor' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

### Trying something else

In [None]:
# Training Testing Splits
y = analysis_dt.invoices_per_month
X_train, X_test, y_train, y_test = train_test_split(analysis_dt, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)