In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import datetime

In [3]:
def featurize(df, label):
  # Set label and features
  y = df[label]
  X = df.astype(float).assign(const=1)  # drop all categorical features and allow y-intercept to vary
  X = X.drop(columns=[label])
  return y, X

def mlr(df_pre_copy, label, print_results=True):
  df = df_pre_copy.copy()
  
  # Set label and features
  y = df[label]
  X = df.astype(float).assign(const=1)  # drop all categorical features and allow y-intercept to vary
  X = X.drop(columns=[label])

  # Run the multiple linear regression model
  model = sm.OLS(y, X)
  results = model.fit()

  # View results
  if(print_results): print(results.summary())
  return results

def results_summary(df, results, print_results=True):

  # Calculate Values
  r2 = results.rsquared
  r2_adj = results.rsquared_adj
  rmse = np.sqrt(sum((results.resid**2))/len(results.resid))
  mae = np.mean(abs(results.resid))
  mean = df['retweets'].mean()

  # Print Values
  if print_results:
    print(r2)
    print(r2_adj)
    print(rmse)
    print(mae)
    print(mean)
  
  return r2, r2_adj, rmse, mae, mean

In [4]:
def vif(df_pre_copy, label):
  df = df_pre_copy.copy()

  import pandas as pd
  from sklearn.linear_model import LinearRegression

  df.drop(inplace=True, columns=[label])

  # initialize dictionaries
  vif_dict, tolerance_dict = {}, {}

  try: df.drop(columns=['const'], inplace=True)
  except: pass

  # form input data for each exogenous variable
  for col in df:
    y = df[col]
    X = df.drop(columns=[col])
    
    # extract r-squared from the fit
    r_squared = LinearRegression().fit(X, y).score(X, y)

    # calculate VIF
    if r_squared < 1: # Prevent division by zero runtime error
      vif = 1/(1 - r_squared) 
    else:
      vif = 100
    vif_dict[col] = vif

    # calculate tolerance
    tolerance = 1 - r_squared
    tolerance_dict[col] = tolerance

    # generate the DataFrame to return
    df_output = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict})

  return df_output.sort_values(by=['VIF'], ascending=False)

In [5]:
def remove_high_p(df, label, r2_diff_max=.01, print_results=False):
  r2_results = []
  df_copy = df.copy()  

  #Initial Test
  r2_results.append(mlr(df_copy, label, False))
  results = r2_results[-1]
  

  while (results.rsquared - results.rsquared_adj > r2_diff_max):
    drop_col = r2_results[-1].pvalues.sort_values(ascending=False).index[0]
    df_copy.drop(inplace=True, columns=[drop_col])
    r2_results.append(mlr(df_copy, label, False))
    results = r2_results[-1]

  df_copy[label] = df[label]
  return df_copy, r2_results

In [9]:
df = pd.read_csv("noReplies-clean.csv", index_col=0)

In [10]:
for i in df:
    print(f"{i}: {type(df[i][0])}")

Unnamed: 0: <class 'numpy.int64'>
Unnamed: 0.1: <class 'numpy.int64'>
created_at: <class 'numpy.int64'>
num_referenced_tweets: <class 'numpy.int64'>
topic_1: <class 'numpy.float64'>
topic_2: <class 'numpy.float64'>
topic_3: <class 'numpy.float64'>
topic_4: <class 'numpy.float64'>
num_hashtags_ln: <class 'numpy.float64'>
num_mentions_ln: <class 'numpy.float64'>
num_cashtags_ln: <class 'numpy.float64'>
num_polls_ln: <class 'numpy.float64'>
like_count_ln: <class 'numpy.float64'>
lang_other: <class 'numpy.int64'>
possibly_sensitive_True: <class 'numpy.int64'>
reply_settings_following: <class 'numpy.int64'>
reply_settings_mentionedUsers: <class 'numpy.int64'>
source_Khoros Publishing: <class 'numpy.int64'>
source_Sprinklr: <class 'numpy.int64'>
source_Sprout Social: <class 'numpy.int64'>
source_Twitter Web App: <class 'numpy.int64'>
source_Twitter Web Client: <class 'numpy.int64'>
source_Twitter for iPhone: <class 'numpy.int64'>
source_other: <class 'numpy.int64'>
is_retweet_True: <class 'n

In [11]:
results = mlr(df, "like_count_ln")
df, results_list = remove_high_p(df, "like_count_ln")

                            OLS Regression Results                            
Dep. Variable:          like_count_ln   R-squared:                       0.319
Model:                            OLS   Adj. R-squared:                  0.319
Method:                 Least Squares   F-statistic:                     1876.
Date:                Fri, 17 Dec 2021   Prob (F-statistic):               0.00
Time:                        18:54:47   Log-Likelihood:            -1.9612e+05
No. Observations:               99993   AIC:                         3.923e+05
Df Residuals:                   99967   BIC:                         3.925e+05
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Unnamed: 0    

In [13]:
results = mlr(df, "like_count_ln")

                            OLS Regression Results                            
Dep. Variable:          like_count_ln   R-squared:                       0.319
Model:                            OLS   Adj. R-squared:                  0.319
Method:                 Least Squares   F-statistic:                     1876.
Date:                Fri, 17 Dec 2021   Prob (F-statistic):               0.00
Time:                        18:55:06   Log-Likelihood:            -1.9612e+05
No. Observations:               99993   AIC:                         3.923e+05
Df Residuals:                   99967   BIC:                         3.925e+05
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
Unnamed: 0    