## Loading Data

In [1]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVR
import numpy as np



In [2]:
dfFinal = pd.read_csv('News_Final.csv')
dfFb_Ec = pd.read_csv('Facebook_Economy.csv')
dfFb_Ms = pd.read_csv('Facebook_Microsoft.csv')
dfFb_Ob = pd.read_csv('Facebook_Obama.csv')
dfFb_Pa = pd.read_csv('Facebook_Palestine.csv')
dfGo_Pa = pd.read_csv('GooglePlus_Palestine.csv')
dfGo_Ob = pd.read_csv('GooglePlus_Obama.csv')
dfGo_Ms = pd.read_csv('GooglePlus_Microsoft.csv')
dfGo_Ec = pd.read_csv('GooglePlus_Economy.csv')
dfLd_Pa = pd.read_csv('LinkedIn_Palestine.csv')
dfLd_Ob = pd.read_csv('LinkedIn_Obama.csv')
dfLd_Ms = pd.read_csv('LinkedIn_Microsoft.csv')
dfLd_Ec = pd.read_csv('LinkedIn_Economy.csv')

In [10]:
#removing the items which was never shared or with share counts = 0
df = dfFinal[(dfFinal.Facebook != -1) & (dfFinal.GooglePlus != -1) & (dfFinal.LinkedIn != -1) & (dfFinal.Facebook != 0) & (dfFinal.GooglePlus != 0) & (dfFinal.LinkedIn != 0)]


In [14]:
#Edit original Data frame to assign group rank
def assignRank(row,rankDictionary):
    if row.IDLink in rankDictionary:
        return rankDictionary[row.IDLink]
    else:
        return 1

In [20]:


def rank_model(df):
    #removing time from publish date
    df['PublishDate'] = df['PublishDate'].astype('datetime64[ns]')
    df['DateOnly'] = df['PublishDate'].dt.date

    #count number of days since Jan 1 1970. These columns would be used to form groups of 3 days
    df['DaysSince1'] = (pd.to_datetime(df['DateOnly']) - pd.datetime(1970,1,1)).dt.days
    df['DaysSince1'] = df['DaysSince1'].subtract(df['DaysSince1'].min()).add(1)
    df['DaysSince2'] = df['DaysSince1'] + 1
    df['DaysSince3'] = df['DaysSince1'] + 2

    print "Calculating rank..."
    # Calculate rank of the news item within the 3 day window group

    loop_start = df['DaysSince1'].min()
    loop_end = df['DaysSince1'].max()
    print loop_start
    print loop_end
    rankDictionary = {}
    for x in range(loop_start, loop_end):
        df_temp = df[(df.DaysSince1 == x) | (df.DaysSince2 == x) | (df.DaysSince3 == x)]
        df_temp.sort_values(by='Facebook')# Made it ascending so that we rank them in reverse order #, ascending=False)
        df_temp['GroupRanking'] = df_temp['Facebook'].rank(ascending=False)
        for index, row in df_temp.iterrows():
            if row['DaysSince1'] == x:
                rankDictionary[row['IDLink']] = row['GroupRanking']
    df['groupRank'] = df.apply (lambda row: assignRank (row,rankDictionary),axis=1)
    df['Facebook'] = df['Facebook'].apply(lambda x: np.log(x + 1))
    #Split DataFrame into X and Y
    #df_X_Without_Rank = pd.get_dummies(df[['Source', 'Topic', 'SentimentTitle', 'SentimentHeadline']])
    df_X = pd.get_dummies(df[['Source', 'Topic', 'SentimentTitle', 'SentimentHeadline', 'groupRank']])
    df_Y = df[['Facebook']]
    xtrain, xtest, ytrain, ytest = train_test_split(df_X, df_Y, test_size=0.25)
    return df, xtrain, ytrain,xtest, ytest
    

In [21]:
def basic_model(df):
    df['Facebook'] = df['Facebook'].apply(lambda x: np.log(x + 1))
    #Split DataFrame into X and Y

    #df_X_Without_Rank = pd.get_dummies(df[['Source', 'Topic', 'SentimentTitle', 'SentimentHeadline']])
    df_X = pd.get_dummies(df[['Source', 'Topic', 'SentimentTitle', 'SentimentHeadline']])
    df_Y = df[['Facebook']]
    xtrain, xtest, ytrain, ytest = train_test_split(df_X, df_Y, test_size=0.25)
    return df, xtrain, ytrain,xtest, ytest

In [19]:
clf = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [None]:

models= [basic_model(df),rank_model(df)]
model_names = ["Basic Model","Model_With_Ranking"]

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.kernel_ridge import KernelRidge


from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsRegressor
from sklearn import linear_model


ModelEvaluationResults = {}




results = {}

for (m,models_) in enumerate(models):
    df,train_X,train_Y,test_X,test_Y = models_
    print train_X.shape
    print train_Y.shape
    
    print "Evalutaion of Model:  "+str(model_names[m])
    
    clfs = [ 
        
    SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
    ,linear_model.SGDRegressor(),
    linear_model.BayesianRidge(),
    linear_model.LassoLars(),
    linear_model.ARDRegression(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.TheilSenRegressor()]
    #KernelRidge(alpha=1.0, coef0=1, degree=3, gamma=None, kernel='linear',kernel_params=None)]

    #clf_names = ['Logistic Regression','KNeighborsRegressor','SVR','KernelRidge']
    clf_names = ['SVR','SGDRegressor','BayesianRidge','LassoLars','ARDRegression','PassiveAggressiveRegressor','TheilSenRegressor'
                ]

    
    ClassifierEvaluationResults = {}
    
    for (i, clf_) in enumerate(clfs):
        #clf = clf_.fit(train_X, train_Y)
        #preds = clf_.predict(Xtest)
        print "********************************************************************"
        print "Training validations evaluations for : "+str(clf_names[i])
        scores = cross_val_score(clf_, train_X, train_Y, cv=5)
        predicted_ratings = cross_val_predict(clf_, test_X,test_Y, cv=5)
        print scores
        train_mean_score=scores.mean()
        print "Testing validations evaluations for : "+str(clf_names[i])


        r2 = r2_score(test_Y, predicted_ratings, multioutput='uniform_average')
        mae = median_absolute_error(test_Y, predicted_ratings)
        msle =mean_squared_log_error(test_Y, predicted_ratings) 
        mse = mean_squared_error(test_Y, predicted_ratings)
        mae = mean_absolute_error(test_Y, predicted_ratings)
        evs = explained_variance_score(test_Y, predicted_ratings)  
        
        data ={'training score':train_mean_score,
            'r2 score':r2,
            'median absolute error':mae,
            'mean squared log error':msle,
            'mean sqaured error':mse,
            'mean absolute error':mae,
            'explained variance error':evs}

        #data ={'training_score':train_mean_score}        
        
        ClassifierEvaluationResults[clf_names[i]] = data
        print data
    
    ModelEvaluationResults[model_names[m]]=ClassifierEvaluationResults
    print ClassifierEvaluationResults
    print "********************************************************************"

(17496, 1738)
(17496, 1)
Evalutaion of Model:  Basic Model
********************************************************************
Training validations evaluations for : SVR
[-1.82808506 -2.0903415  -1.83809987 -1.97612465 -1.78782276]
Testing validations evaluations for : SVR
{'r2 score': -1.9850734085895656, 'mean absolute error': 0.01681096577526437, 'mean sqaured error': 0.0003322430453001734, 'training score': -1.904094769466904, 'median absolute error': 0.01681096577526437, 'explained variance error': -4.194283299430168e-05, 'mean squared log error': 0.00023086160414568642}
********************************************************************
Training validations evaluations for : SGDRegressor




[0.28238692 0.26564975 0.27263521 0.27466754 0.26608052]
Testing validations evaluations for : SGDRegressor
{'r2 score': 0.16421818874099725, 'mean absolute error': 0.006636318555718803, 'mean sqaured error': 9.302374051510839e-05, 'training score': 0.27228398793869796, 'median absolute error': 0.006636318555718803, 'explained variance error': 0.16441970808967388, 'mean squared log error': 6.520521484964668e-05}
********************************************************************
Training validations evaluations for : BayesianRidge
[0.47328439 0.44453382 0.44320266 0.44396148 0.43084286]
Testing validations evaluations for : BayesianRidge
{'r2 score': 0.40868388356358676, 'mean absolute error': 0.005363077149319752, 'mean sqaured error': 6.581435039238538e-05, 'training score': 0.44716504051875283, 'median absolute error': 0.005363077149319752, 'explained variance error': 0.4088762391773876, 'mean squared log error': 4.626754492946323e-05}
**********************************************

In [20]:
clf.fit(xtrain, ytrain) 

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [21]:
preds = clf.predict(xtest)

In [22]:
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score

r2 = r2_score(ytest, preds, multioutput='uniform_average')
mae = median_absolute_error(ytest, preds)
msle =mean_squared_log_error(ytest, preds) 
mse = mean_squared_error(ytest, preds)
mabe = mean_absolute_error(ytest, preds)
evs = explained_variance_score(ytest, preds) 

In [23]:
print r2
print mae
print msle
print mse
print mabe
print evs

0.830426214156
0.434626625337
0.0355805487688
0.544091711132
0.55501256951
0.832091517462


In [37]:
#Without Ranking

clf2 = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
xtrain2 = xtrain.drop(['groupRank'], axis=1, inplace=False)
xtest2 = xtest.drop(['groupRank'], axis=1, inplace=False)

In [39]:
xtrain2.shape

(17496, 1738)

In [None]:
clf2.fit(xtrain2, ytrain)

In [None]:
preds2 = clf2.predict(xtest2)

In [None]:

r2_B = r2_score(ytest, preds2, multioutput='uniform_average')
mae_B = median_absolute_error(ytest, preds2)
msle_B =mean_squared_log_error(ytest, preds2) 
mse_B = mean_squared_error(ytest, preds2)
mabe_B = mean_absolute_error(ytest, preds2)
evs_B = explained_variance_score(ytest, preds2) 

In [27]:
#Output Without Group Ranking

print r2_B
print mae_B
print msle_B
print mse_B
print mabe_B
print evs_B

0.830426214156
1.01368228622
0.102643929117
2.22747772465
1.19279913019
0.309231285723


In [41]:
ytest['OP_WithRank']=preds
ytest['OP_With_OUT_Rank']=preds2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [42]:
ytest[0:10]

Unnamed: 0,Facebook,OP_WithRank,OP_With_OUT_Rank
34344,1.94591,2.3381,3.174727
39626,4.330733,4.833302,4.947679
19904,5.648974,5.48515,4.94009
65001,6.645091,6.954611,4.975507
16330,5.308268,5.023694,4.979165
56285,1.098612,2.682361,3.521087
59334,5.513429,5.619233,5.005762
70256,5.820083,5.900209,4.926259
70451,6.248043,6.263042,3.315377
80569,5.09375,5.233761,4.940583


In [43]:
ytest.to_csv('output.csv')