In [2]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime
from pytz import timezone
import math
import matplotlib.pyplot as plt

In [3]:
reddit_df = pd.read_csv('../data/reddit_text_sentiment.csv')
stocks_df = pd.read_csv("../data/stock_prices.csv")
companies_df = pd.read_csv("../data/companies.csv")
brands_df = pd.read_csv("../data/brands.csv")
industries_df = pd.read_csv("../data/industries.csv")

In [4]:
#helper function to convert utc to EST date
eastern = timezone('US/Eastern')
def utc_to_est(utc):
    return datetime.fromtimestamp(utc, tz = eastern)

In [5]:
#create new stocks column with datetime format of daily market close times
stocks_df["date"] = pd.to_datetime(stocks_df["date"]).values.astype(np.int64) // 10**6
stocks_df["date"] = (stocks_df["date"] + 57600000)//1000
stocks_df['datetime'] = stocks_df['date'].apply(utc_to_est)

In [6]:
#created new reddit column with datetime format of daily market close times
reddit_df['datetime'] = reddit_df['created_utc'].apply(utc_to_est)

In [7]:
#select relevant columns
reddit_df = reddit_df[['subreddit', 'datetime', 'score', 'compound', 'positive', 'neutral', 'negative']]

#add column that helps in counting posts when grouped
reddit_df['num_posts'] = 1
reddit_df['positive_posts'] = reddit_df['compound'] > 0.3
reddit_df['negative_posts'] = reddit_df['compound'] < -0.3

#replace zeros with insubstatial float in reddit scores
reddit_df.score = reddit_df.score.apply(lambda x: max(x, 0.01))

#weighted scores
for s in ['compound', 'negative', 'positive', 'neutral']:
    reddit_df['weighted_{0}'.format(s)] = reddit_df[s]*reddit_df['score'].apply(lambda x: math.log(x+1))
    
#add company_id as column
reddit_df = reddit_df.merge(brands_df[['subreddit', 'company_id']], on='subreddit')

In [8]:
#helper function that takes a dataframe and returns separate dataframes for each company
def separate_reddit(reddit_df, by='all'):
    if by not in ['all', 'company', 'industry']:
        print('argument invalid: must be = <all>, <company>, or <industry>')
        pass
    else:
        if by == 'industry':
            temp = reddit_df.merge(companies_df[['id', 'industry_id']], left_on = 'company_id', right_on='id')
            return [temp[temp['industry_id']==i][['industry_id', 'datetime','weighted_compound', 'weighted_negative', 'weighted_positive', 'weighted_neutral', 'num_posts', 'positive_posts', 'negative_posts']] for i in temp.industry_id.unique()]
        elif by == 'company':
            return [reddit_df[reddit_df['company_id']==i][['company_id', 'datetime','weighted_compound', 'weighted_negative', 'weighted_positive', 'weighted_neutral', 'num_posts', 'positive_posts', 'negative_posts']] for i in reddit_df.company_id.unique()]
        else:
            return [reddit_df[['company_id', 'datetime','weighted_compound', 'weighted_negative', 'weighted_positive', 'weighted_neutral', 'num_posts', 'positive_posts', 'negative_posts']]]

In [9]:
#groups data for a single company using sliding window - number of days specified in call
def group_data(df, by='all', days = 1):
    if by not in ['all', 'company', 'industry']:
        print('invalid arg: must be in [all, company, industry]')
        pass
    else:
        if by=='industry':
            i_id = df.industry_id.unique()[0] #save industry_id
            temp_df = df[['datetime', 'weighted_compound', 'weighted_positive', 'weighted_neutral', 'weighted_negative', 'num_posts', 'positive_posts', 'negative_posts']].groupby(pd.Grouper(key='datetime', freq='24h', base=11, label='right')).sum() #groupby day
            min_date = min(temp_df.index)
            max_date = max(temp_df.index)
            date_idx = [i for i in pd.date_range(min_date, max_date)] #new index
            temp_df = temp_df.reindex(date_idx).fillna(0).rolling(days).sum()[days-1:] #make dataframe with rolling window sum
            temp_df['industry_id'] = i_id #restore industry_id
            temp_df.reset_index(inplace=True)
            return temp_df
        elif by=='company':
            c_id = df.company_id.unique()[0] #save company_id
            temp_df = df[['datetime', 'weighted_compound', 'weighted_positive', 'weighted_neutral', 'weighted_negative', 'num_posts', 'positive_posts', 'negative_posts']].groupby(pd.Grouper(key='datetime', freq='24h', base=11, label='right')).sum() #groupby day
            min_date = min(temp_df.index)
            max_date = max(temp_df.index)
            date_idx = [i for i in pd.date_range(min_date, max_date)] #new index
            temp_df = temp_df.reindex(date_idx).fillna(0).rolling(days).sum()[days-1:] #make dataframe with rolling window sum
            temp_df['company_id'] = c_id #restore company_id
            temp_df.reset_index(inplace=True)
            return temp_df
        else:
            return pd.concat([group_data(d, 'company', days) for d in separate_reddit(df, 'company')])
    
    

In [10]:
def make_dataframes(df, separation='all', lookback=1, min_date = None, max_date = None):
    if min_date:
        df = df[df['datetime'] >= pd.to_datetime(min_date).tz_localize('US/Eastern')]
    if max_date:
        df = df[df['datetime'] <= pd.to_datetime(max_date).tz_localize('US/Eastern')]
    dfs = []
    separated = separate_reddit(df, separation)
    for d in separated:
        temp = group_data(d, separation, lookback)
        for i in ['weighted_compound', 'weighted_negative', 'weighted_positive', 'weighted_neutral', 'positive_posts', 'negative_posts']: #iterate over scores columns
            temp['avg_{}'.format(i)] = temp[i]/temp['num_posts'].apply(lambda x: max(x, 1)) #set avgerage weighted scores columns
        temp = temp.merge(stocks_df[['company_id', 'datetime', 'change_percent']], on=['company_id', 'datetime']) #add change_percent column
        dfs.append(temp)
    return dfs
        

In [11]:
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [12]:
#helper function that finds optimal C for a given training set (logistic regression)
def cross_validate_logreg(X_train, y_train, c_s = [0.1, 0.3, 1, 3, 10, 30, 100]):
    c_s = [0.1, 0.3, 1, 3, 10, 30, 100]
    lg_cv = LogisticRegressionCV(Cs = c_s, scoring='f1').fit(X_train, y_train)
    return c_s[np.argmax(np.mean(lg_cv.scores_[True],axis=0))], np.max(np.mean(lg_cv.scores_[True], axis=0))

In [13]:
def logreg_by_lookback(reddit_df, lookback = [1], c_s = [0.1, 0.3, 1, 3, 10, 30, 100]):
    for i in lookback:
        full_df = make_dataframes(reddit_df, lookback = i, min_date = '2014-03-01')[0]
        X = full_df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts', 'avg_positive_posts', 'avg_negative_posts']]
        y = full_df['change_percent'] > 0
        X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state = 6240)
        best_c, f1_score = cross_validate_logreg(X_train, y_train, c_s)
        print('Best f1_score = {0} for {1} day lookback with C = {2}'.format(f1_score, i, best_c))

In [14]:
logreg_by_lookback(reddit_df, lookback = [1, 3, 6, 10, 15])

Best f1_score = 0.6837936312061542 for 1 day lookback with C = 0.1
Best f1_score = 0.6847001390207956 for 3 day lookback with C = 0.1
Best f1_score = 0.6821921219983137 for 6 day lookback with C = 0.1
Best f1_score = 0.6840706556579365 for 10 day lookback with C = 0.1
Best f1_score = 0.6854336111299288 for 15 day lookback with C = 0.3


In [21]:
#test out best logistic regression model
full_df = make_dataframes(reddit_df, lookback = 15, min_date = '2014-03-01')[0]
X = full_df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts']]
y = full_df['change_percent'] > 0
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.25, random_state = 6240)
lg = LogisticRegression(C=0.3).fit(X_train, y_train)
lg_preds = lg.predict(X_test)
f1 = f1_score(lg_preds, y_test)
precision = precision_score(lg_preds, y_test)
recall = recall_score(lg_preds, y_test)
accuracy = accuracy_score(lg_preds, y_test)
print('f1: {0}, precision: {1}, recall: {2}, accuracy: {3}'.format(f1, precision, recall, accuracy))

f1: 0.6860539163659521, precision: 1.0, recall: 0.5221324717285946, accuracy: 0.5221324717285946


In [22]:
##Baseline tests for linear regression
lookback_days = [1, 3, 6, 10, 15]
for i in lookback_days:
    full_df = make_dataframes(reddit_df, lookback = i, min_date = '2014-03-01')[0]
    X = full_df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts']]
    y = full_df['change_percent']
    X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state = 6240)
    ln = LinearRegression(normalize=True).fit(X_train, y_train)
    ln_pred = ln.predict(X_test)
    ln_f1 = f1_score((ln_pred > 0), y_test >0)
    ln_precision = precision_score((ln_pred > 0), y_test>0)
    ln_recall  = recall_score(ln_pred>0, y_test>0)
    ln_accuracy = accuracy_score(ln_pred>0, y_test>0)
    print('{0} day lookup:'.format(i))
    print('f1: {0}, precision: {1}, recall: {2}, accuracy: {3}'.format(ln_f1, ln_precision, ln_recall, ln_accuracy))

1 day lookup:
f1: 0.6690469651194727, precision: 0.9354838709677419, recall: 0.5207353569901667, accuracy: 0.5166466105094264
3 day lookup:
f1: 0.6810666666666667, precision: 0.9762996941896025, recall: 0.5229320229320229, accuracy: 0.5200642054574639
6 day lookup:
f1: 0.6049423393739704, precision: 0.6902255639097744, recall: 0.5384164222873901, accuracy: 0.5171163914619412
10 day lookup:
f1: 0.6879014989293363, precision: 0.9801678108314263, recall: 0.5298969072164949, accuracy: 0.5292692773516351
15 day lookup:
f1: 0.6773579777113345, precision: 0.9614197530864198, recall: 0.5228703315148971, accuracy: 0.5205977382875606


In [17]:
#Random Forest
lookback_days = [1, 3, 6, 10, 15, 21, 28]
for i in lookback_days:
    full_df = make_dataframes(reddit_df, lookback = i, min_date = '2014-03-01')[0]
    X = full_df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts']]
    y = full_df['change_percent'] > 0
    X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state = 6240)
    rf = RandomForestClassifier(n_estimators =100)
    rf.fit(X_train, y_train)
    preds = rf.predict(X_test)
    f1 = f1_score(preds, y_test)
    precision = precision_score(preds, y_test)
    recall = recall_score(preds, y_test)
    accuracy = accuracy_score(preds, y_test)
    print('{0} day lookback:'.format(i))
    print('f1: {0}, precision: {1}, recall: {2}, accuracy: {3}'.format(f1, precision, recall, accuracy))

1 day lookback:
f1: 0.595617529880478, precision: 0.6889400921658986, recall: 0.5245614035087719, accuracy: 0.5114320096269555
3 day lookback:
f1: 0.5839172505839173, precision: 0.668960244648318, recall: 0.5180580224985198, accuracy: 0.4995987158908507
6 day lookback:
f1: 0.5924439986626547, precision: 0.6661654135338346, recall: 0.5334136062612884, accuracy: 0.509061619009263
10 day lookback:
f1: 0.5748138117806364, precision: 0.6475972540045767, recall: 0.516737674984784, accuracy: 0.4929350020185709
15 day lookback:
f1: 0.5751189666893269, precision: 0.6527777777777778, recall: 0.5139732685297691, accuracy: 0.49515347334410337
21 day lookback:
f1: 0.5778546712802767, precision: 0.647788983708301, recall: 0.5215490318550906, accuracy: 0.5072697899838449
28 day lookback:
f1: 0.5765199161425576, precision: 0.6400310318076028, recall: 0.5244755244755245, accuracy: 0.5105008077544426


In [23]:
#XG Boost
lookback_days = [1, 3, 6, 10, 15]
for i in lookback_days:
    full_df = make_dataframes(reddit_df, lookback = i, min_date = '2014-03-01')[0]
    X = full_df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts']]
    y = full_df['change_percent'] > 0
    X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state = 6240)
    xg = GradientBoostingClassifier()
    xg.fit(X_train, y_train)
    preds = xg.predict(X_test)
    f1 = f1_score(preds, y_test)
    precision = precision_score(preds, y_test)
    recall = recall_score(preds, y_test)
    accuracy = accuracy_score(preds, y_test)
    print('{0} day lookback:'.format(i))
    print('f1: {0}, precision: {1}, recall: {2}, accuracy: {3}'.format(ln_f1, ln_precision, ln_recall, ln_accuracy))

1 day lookback:
f1: 0.6773579777113345, precision: 0.9614197530864198, recall: 0.5228703315148971, accuracy: 0.5205977382875606
3 day lookback:
f1: 0.6773579777113345, precision: 0.9614197530864198, recall: 0.5228703315148971, accuracy: 0.5205977382875606
6 day lookback:
f1: 0.6773579777113345, precision: 0.9614197530864198, recall: 0.5228703315148971, accuracy: 0.5205977382875606
10 day lookback:
f1: 0.6773579777113345, precision: 0.9614197530864198, recall: 0.5228703315148971, accuracy: 0.5205977382875606
15 day lookback:
f1: 0.6773579777113345, precision: 0.9614197530864198, recall: 0.5228703315148971, accuracy: 0.5205977382875606


In [38]:
len(reddit_df.company_id.unique())

30

In [64]:
#create array of evaluation metrics by company with random forest
rf_company_results = np.zeros((30, 5))
full_df = make_dataframes(reddit_df, lookback = 28, min_date = '2014-03-01')[0]
full_df = full_df[full_df.num_posts >0]
X = full_df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts', 'avg_positive_posts', 'avg_negative_posts']]
y = full_df['change_percent'] > 0
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state = 6240)
rf = RandomForestClassifier(n_estimators =100)
rf.fit(X_train, y_train)
preds = rf.predict(X_test)
f1 = f1_score(preds, y_test)
precision = precision_score(preds, y_test)
recall = recall_score(preds, y_test)
accuracy = accuracy_score(preds, y_test)
dfs = make_dataframes(reddit_df, 'company', lookback = i, min_date = '2014-03-01')
print('overall scores: f1:{0}, precision: {1}, recall:{2}, accuracy{3}'.format(f1, precision, recall, accuracy))
i = 0
for df in dfs:
    c_id = df.company_id[0]
    X = df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts', 'avg_positive_posts', 'avg_negative_posts']]
    y = df['change_percent'] > 0
    preds = rf.predict(X)
    f1 = f1_score(preds, y)
    precision = precision_score(preds, y)
    recall = recall_score(preds, y)
    accuracy = accuracy_score(preds, y)
    rf_company_results[i,] = [c_id, f1, precision, recall, accuracy]
    i+=1
        
        

overall scores: f1:0.5384615384615384, precision: 0.5535535535535535, recall:0.524170616113744, accuracy0.4960127591706539


In [77]:
#see performance of Random Forest by company
#each row shows : (company id, f1, precision, recall, accuracy)
rf_df = pd.DataFrame(rf_company_results, columns = ['company_id', 'f1', 'precision', 'recall', 'accuracy'])
companies_df[['id', 'name']].merge(rf_df, left_on='id', right_on='company_id').drop(columns ='id')

Unnamed: 0,name,company_id,f1,precision,recall,accuracy
0,Microsoft Corporation,1.0,0.669246,0.64794,0.692,0.666016
1,Apple Inc.,2.0,0.70632,0.698529,0.714286,0.691406
2,"Amazon.com, Inc.",3.0,0.726944,0.723022,0.730909,0.705078
3,"Facebook, Inc.",4.0,0.715867,0.72119,0.710623,0.699219
4,Alphabet Inc,5.0,0.6787,0.681159,0.676259,0.652344
5,eBay Inc.,6.0,0.736059,0.758621,0.714801,0.722656
6,"Twitter, Inc.",7.0,0.69703,0.6875,0.706827,0.701172
7,HP Inc.,8.0,0.668501,0.920455,0.524838,0.529297
8,Dell Technologies Inc.,9.0,0.77193,0.802083,0.743961,0.72997
9,"Uber Technologies, Inc.",10.0,0.712644,0.704545,0.72093,0.698795


In [52]:
#create array of performance metrics of logistic regression by company
lg_company_results = np.zeros((30, 5))
full_df = make_dataframes(reddit_df, lookback = 15, min_date = '2014-03-01')[0]
full_df = full_df[full_df.num_posts >0]
X = full_df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts', 'avg_positive_posts', 'avg_negative_posts']]
y = full_df['change_percent'] > 0
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state = 6240)
lg = LogisticRegression()
lg.fit(X_train, y_train)
preds = lg.predict(X_test)
f1 = f1_score(preds, y_test)
precision = precision_score(preds, y_test)
recall = recall_score(preds, y_test)
accuracy = accuracy_score(preds, y_test)
dfs = make_dataframes(reddit_df, 'company', lookback = i, min_date = '2014-03-01')
i=0
for df in dfs:
    c_id = df.company_id[0]
    X = df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts', 'avg_positive_posts', 'avg_negative_posts']]
    y = df['change_percent'] > 0
    preds = lg.predict(X)
    f1 = f1_score(preds, y)
    precision = precision_score(preds, y)
    recall = recall_score(preds, y)
    accuracy = accuracy_score(preds, y)
    lg_company_results[i,] = [c_id, f1, precision, recall, accuracy]
    i+=1

In [76]:
#see logistic regression results by company
lg_df = pd.DataFrame(lg_company_results, columns = ['company_id', 'f1', 'precision', 'recall', 'accuracy'])
companies_df[['id', 'name']].merge(lg_df, left_on='id', right_on='company_id').drop(columns ='id')

Unnamed: 0,name,company_id,f1,precision,recall,accuracy
0,Microsoft Corporation,1.0,0.685494,1.0,0.521484,0.521484
1,Apple Inc.,2.0,0.693878,1.0,0.53125,0.53125
2,"Amazon.com, Inc.",3.0,0.703797,1.0,0.542969,0.542969
3,"Facebook, Inc.",4.0,0.68886,1.0,0.525391,0.525391
4,Alphabet Inc,5.0,0.700508,1.0,0.539062,0.539062
5,eBay Inc.,6.0,0.675291,1.0,0.509766,0.509766
6,"Twitter, Inc.",7.0,0.666667,1.0,0.5,0.5
7,HP Inc.,8.0,0.680412,1.0,0.515625,0.515625
8,Dell Technologies Inc.,9.0,0.725898,1.0,0.569733,0.569733
9,"Uber Technologies, Inc.",10.0,0.692913,1.0,0.53012,0.53012


In [78]:
#gradient boost
xg_company_results = np.zeros((30, 5))
full_df = make_dataframes(reddit_df, lookback = 28, min_date = '2014-03-01')[0]
full_df = full_df[full_df.num_posts >0]
X = full_df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts', 'avg_positive_posts', 'avg_negative_posts']]
y = full_df['change_percent'] > 0
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.25, random_state = 6240)
xg = GradientBoostingClassifier(n_estimators= 100)
xg.fit(X_train, y_train)
preds = xg.predict(X_test)
f1 = f1_score(preds, y_test)
precision = precision_score(preds, y_test)
recall = recall_score(preds, y_test)
accuracy = accuracy_score(preds, y_test)
dfs = make_dataframes(reddit_df, 'company', lookback = i, min_date = '2014-03-01')
print('overall scores: f1:{0}, precision: {1}, recall:{2}, accuracy{3}'.format(f1, precision, recall, accuracy))
i=0
for df in dfs:
    c_id = df.company_id[0]
    X = df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts', 'avg_positive_posts', 'avg_negative_posts']]
    y = df['change_percent'] > 0
    preds = xg.predict(X)
    f1 = f1_score(preds, y)
    precision = precision_score(preds, y)
    recall = recall_score(preds, y)
    accuracy = accuracy_score(preds, y)
    xg_company_results[i,] = [c_id, f1, precision, recall, accuracy]
    i+=1

overall scores: f1:0.628087986463621, precision: 0.7347585114806018, recall:0.5484633569739953, accuracy0.5327380952380952


In [79]:
#xg boost results by company
xg_df = pd.DataFrame(xg_company_results, columns = ['company_id', 'f1', 'precision', 'recall', 'accuracy'])
companies_df[['id', 'name']].merge(xg_df, left_on='id', right_on='company_id').drop(columns ='id')

Unnamed: 0,name,company_id,f1,precision,recall,accuracy
0,Microsoft Corporation,1.0,0.663717,0.842697,0.547445,0.554688
1,Apple Inc.,2.0,0.672439,0.856618,0.553444,0.556641
2,"Amazon.com, Inc.",3.0,0.691892,0.920863,0.554113,0.554688
3,"Facebook, Inc.",4.0,0.677551,0.925651,0.534335,0.537109
4,Alphabet Inc,5.0,0.674253,0.858696,0.555035,0.552734
5,eBay Inc.,6.0,0.599315,0.670498,0.541796,0.542969
6,"Twitter, Inc.",7.0,0.543651,0.535156,0.552419,0.550781
7,HP Inc.,8.0,0.671159,0.943182,0.520921,0.523438
8,Dell Technologies Inc.,9.0,0.719682,0.942708,0.581994,0.581602
9,"Uber Technologies, Inc.",10.0,0.626263,0.704545,0.563636,0.554217


In [56]:
#linear regression
ln_company_results = np.zeros((30, 5))
full_df = make_dataframes(reddit_df, lookback = i, min_date = '2014-03-01')[0]
full_df = full_df[full_df.num_posts >0]
X = full_df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts', 'avg_positive_posts', 'avg_negative_posts']]
y = full_df['change_percent']
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.25, random_state = 6240)
ln = LinearRegression()
ln.fit(X_train, y_train)
preds = ln.predict(X_test) > 0
f1 = f1_score(preds, y_test > 0)
precision = precision_score(preds, y_test > 0)
recall = recall_score(preds, y_test > 0)
accuracy = accuracy_score(preds, y_test > 0)
dfs = make_dataframes(reddit_df, 'company', lookback = i, min_date = '2014-03-01')
i=0
for df in dfs:
    c_id = df.company_id[0]
    X = df[['avg_weighted_compound', 'avg_weighted_positive', 'avg_weighted_negative', 'avg_weighted_neutral', 'num_posts', 'avg_positive_posts', 'avg_negative_posts']]
    y = df['change_percent']
    preds = ln.predict(X) > 0
    f1 = f1_score(preds, y>0)
    precision = precision_score(preds, y>0)
    recall = recall_score(preds, y>0)
    accuracy = accuracy_score(preds, y>0)
    ln_company_results[i,] = [c_id, f1, precision, recall, accuracy]
    i+=1

In [74]:
ln_df = pd.DataFrame(ln_company_results, columns = ['company_id', 'f1', 'precision', 'recall', 'accuracy'])
companies_df[['id', 'name']].merge(ln_df, left_on='id', right_on='company_id').drop(columns ='id')

Unnamed: 0,name,company_id,f1,precision,recall,accuracy
0,Microsoft Corporation,1.0,0.685494,1.0,0.521484,0.521484
1,Apple Inc.,2.0,0.693878,1.0,0.53125,0.53125
2,"Amazon.com, Inc.",3.0,0.703797,1.0,0.542969,0.542969
3,"Facebook, Inc.",4.0,0.68886,1.0,0.525391,0.525391
4,Alphabet Inc,5.0,0.700508,1.0,0.539062,0.539062
5,eBay Inc.,6.0,0.672727,0.992337,0.508841,0.507812
6,"Twitter, Inc.",7.0,0.669281,1.0,0.502947,0.505859
7,HP Inc.,8.0,0.672021,0.950758,0.519669,0.521484
8,Dell Technologies Inc.,9.0,0.701754,0.9375,0.560748,0.545994
9,"Uber Technologies, Inc.",10.0,0.698413,1.0,0.536585,0.542169


In [68]:
companies_df

Unnamed: 0,id,name,hq_city,hq_state,year_founded,year_ipo,industry_id,stock_ticker,stock_exchange
0,1,Microsoft Corporation,Redmond,WA,1975,1986,1,MSFT,NASDAQ
1,2,Apple Inc.,Cupertino,CA,1976,1980,1,AAPL,NASDAQ
2,3,"Amazon.com, Inc.",Seattle,WA,1994,1997,1,AMZN,NASDAQ
3,4,"Facebook, Inc.",Menlo Park,CA,2004,2012,1,FB,NASDAQ
4,5,Alphabet Inc,Mountain View,CA,1998,2004,1,GOOGL,NASDAQ
...,...,...,...,...,...,...,...,...,...
117,118,Fiat Chrysler Automobiles,Amsterdam,Netherlands,1899,2010,17,FCAU,NYSE
118,119,"Honda Motor Co., Ltd.",Tokyo,Japan,1946,1977,17,HMC,NYSE
119,120,Toyota Motor Corporation,Toyota City,Japan,1937,1999,17,TM,NYSE
120,121,Grubhub Inc.,Chicago,IL,2004,2014,1,GRUB,NYSE
