In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor
import scipy.stats as stats
import itertools


  from pandas.core import datetools


In [4]:
df = pd.read_csv("USvideos.csv")
data = pd.read_csv("mjollnir.csv")
df.shape
data.shape

(281, 22)

In [71]:
def get_features_combo(features):
    print("Generating list_of_features")
    list_of_features = []
    for L in range(0, len(features)+1):    
        for subset in itertools.combinations(features, L):
            list_of_features.append([s for s in subset])
    print("Got list_of_features")
    return list_of_features 

def get_pvalue(X, y, list_of_features, percentile):
    """returns the p-value at a percentile"""
    "Getting all p-values"
    p_values = []
    for i in list_of_features:
        if (len(i) > 0):
            X_subset = sm.add_constant(X[i])
            model = sm.OLS(y, X_subset).fit()
            residuals = model.resid
            p_value = stats.normaltest(residuals)[1]
            p_values.append(p_value)
    pvalue = np.percentile(p_values, percentile)
    print("Got best p-value")
    return p_value

def get_best_models(X, y, list_of_features, coef_pvalue_threshold = 0.05, res_pvalue_threshold = 0.05):
    print("Getting best models")
    winning_models = {}
    #getting all possible combinations of features 
    for i in list_of_features:
        if (len(i) > 0):
            #adding constant to X
            X_subset = sm.add_constant(X[i])
            #fitting the model 
            model = sm.OLS(y, X_subset).fit()
            # 1st condition
            # Check if p-value of all coefficients is less than 0.05 
            if (sum(model.pvalues < coef_pvalue_threshold) == len(model.pvalues)):
                residuals = model.resid 
                #checking if the residuals are normal, a p-value < 0.05 means that the null hypothesis of normality of
                #residuals is rejected 
                p_value = stats.normaltest(residuals)[1]
                if (p_value > res_pvalue_threshold):
                    # the number of features is 1, there is no vif score 
                    if (len(i) < 2):
                        #storing the feature 
                        winning_features = [s for s in X_subset.columns]
                        #storing Ajd-R2 as key, list of features as values 
                        winning_models[round(model.rsquared_adj, 3)] = winning_features
                    else:
                        #getting vifs if list of features is more than 1
                        vifs = [variance_inflation_factor(X_subset.values, i) for i in range(X_subset.shape[1])]
                        #getting vifs greater than threshold 5
                        above_5_vif = [s for s in vifs if s > 5]
                        # 2nd condition, if no vifs is greater than 5
                        if (len(above_5_vif)) == 0:
                            print("Model passed, getting features")
                            #storing the feature 
                            winning_features = [s for s in X_subset.columns]
                            #storing Ajd-R2 as key, list of features as values 
                            winning_models[round(model.rsquared_adj,3)] = winning_features
    return winning_models

In [None]:
features  = ['Max of  likes', 'Max of Categ 2','Max of Categ 10', 
          'Max of Categ 15', 'Max of Categ 17',
       'Max of Categ 19', 'Max of Categ 20', 'Max of Categ 22',
       'Max of Categ 23', 
       'Max of # tags', 'Max of Views after 1 day',
       'Max of comments after 1 day', 'Max of likes after 1 day']

X = X[features]
y = data['Max of Views after 5 days']
coef_pvalue_threshold = 0.05
list_of_features = get_features_combo(features) 
percentile = 25

res_pvalue_threshold = get_pvalue(X, y, list_of_features, percentile)
winning_models = get_best_models(X, y, list_of_features, coef_pvalue_threshold, res_pvalue_threshold)

Generating list_of_features
Got list_of_features
Got best p-value
Getting best models
Model passed, getting features


In [70]:
winning_models

{0.544: ['const', 'Max of comments after 1 day', 'Max of likes after 1 day']}