## 1. Importing Python Libraries:

In [114]:
import numpy as np
import pandas as pd
import csv
import mysql.connector
from datetime import datetime as dt
import matplotlib
from sklearn.linear_model import LinearRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, mean_squared_error as mse

## 2. Importing the Data Sets:

In [2]:
# Importing the Twitter Data:
twitter_df = pd.read_csv("../data/local_data/twitter/twitter.csv")

In [3]:
# Importing Industry Categories Table:
industries_df = pd.read_csv("../data/local_data/stocks/csv/industries.csv")
companies_df = pd.read_csv("../data/list_of_companies.csv")

In [4]:
# Importing the Stock Price Data:

stocks_df = pd.read_csv("../data/local_data/stocks/csv/stock_prices.csv")

## 3. Creating the Labeled Training Data Set:

In [5]:
# Filtering the Stock Price data frame:

company_ids = twitter_df.company_id.unique()
stocks_df = stocks_df[stocks_df.company_id.isin(company_ids)].dropna()

In [6]:
# Converting the date columns to Unix Epoch Time:

twitter_df["date"] = pd.to_datetime(twitter_df["date"]).values.astype(np.int64) // 10**6
twitter_df["date"] = (twitter_df["date"])//1000

stocks_df["date"] = pd.to_datetime(stocks_df["date"]).values.astype(np.int64) // 10**6
stocks_df["date"] = (stocks_df["date"] + 57600000)//1000

In [7]:
# Obtaining the date range for both data sets:

twitter_min_dates = twitter_df.groupby(["company_id"]).date.min()
twitter_max_dates = twitter_df.groupby(["company_id"]).date.max()

stocks_min_dates = stocks_df.groupby(["company_id"]).date.min()
stocks_max_dates = stocks_df.groupby(["company_id"]).date.max()

In [8]:
# Slicing the datasets based on the min/max dates: 

for id in company_ids:
    stocks_df = stocks_df.drop(stocks_df[((stocks_df["company_id"] == id) & (stocks_df["date"] < (twitter_min_dates[id] + 86400)))].index)
    stocks_df = stocks_df.drop(stocks_df[((stocks_df["company_id"] == id) & (stocks_df["date"] > (twitter_max_dates[id] - 86400)))].index)
    twitter_df = twitter_df.drop(twitter_df[((twitter_df["company_id"] == id) & (twitter_df["date"] < (stocks_min_dates[id] - 86400)))].index)
    twitter_df = twitter_df.drop(twitter_df[((twitter_df["company_id"] == id) & (twitter_df["date"] > (stocks_max_dates[id] + 86400)))].index)

In [19]:
rows = []

for id in company_ids:
    print(id)
    this_stock = stocks_df[stocks_df["company_id"] == id]
    this_twitter = twitter_df[twitter_df["company_id"] == id]
    this_ind_id = this_twitter["industry_id"].iloc[0]
    for index, row in this_stock.iterrows():
        end = row.date
        start = end - 86400
        day_twitter = twitter_df[(twitter_df["date"] >= start) & (twitter_df["date"] <= end)].mean()
        human_date = dt.fromtimestamp(row.date).strftime('%Y-%m-%d %H:%M:%S').split(" ")[0]
        rows.append({
            "company_id": row.company_id,
            "industry_id": this_ind_id,
            "date": human_date,
            "overall": day_twitter.overall_sentiment,
            "positive": day_twitter.positive_sentiment,
            "negative": day_twitter.negative_sentiment,
            "neutral": day_twitter.neutral_sentiment,
            "change_percent": row.change_percent
        }) 

23
24
21
7
8
9
10
53
54
55
56
57
58
59
60
61
62
41
43
44
45
46
47
48
49
50
51
2
3
4
5
64
65
66
67
117
68
69
122
71
72
12
13
14
15
16
17
18
19
20


In [20]:
training_data = pd.DataFrame.from_dict(rows, orient='columns')

In [25]:
display(training_data)

Unnamed: 0,company_id,industry_id,date,overall,positive,negative,neutral,change_percent
0,23,3,2017-01-17,0.192433,0.140295,0.040446,0.819267,-2.0636
1,23,3,2017-01-18,0.164156,0.133215,0.047739,0.819035,1.4047
2,23,3,2017-01-19,0.158477,0.130572,0.048721,0.820712,-1.9697
3,23,3,2017-01-20,0.172023,0.136606,0.045839,0.817556,0.2870
4,23,3,2017-01-23,0.154186,0.134041,0.051439,0.814524,-0.0440
...,...,...,...,...,...,...,...,...
37175,20,3,2020-02-27,0.196590,0.118910,0.051424,0.829673,-4.8972
37176,20,3,2020-02-28,0.216627,0.130245,0.049570,0.820192,-2.1627
37177,20,3,2020-03-02,0.207522,0.119663,0.048932,0.831396,3.0526
37178,20,3,2020-03-03,0.207606,0.119507,0.048603,0.831879,-5.5158


In [23]:
# Saving the training data set to a csv file:

training_data.to_csv(r'twitter_training_data.csv', index = False)

## 4. Building the Baselines:

In [131]:
# Spliting the data set into training and testing sets:

X = training_data[["positive", "negative", "neutral", "change_percent"]].dropna()
y = X["change_percent"]
y2 = (X["change_percent"] >= 0).astype(int)
del X["change_percent"]

X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=0.2, random_state=0)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=0)

### 4.1 Multi-linear Regression Model:

In [132]:
# Building the logistic Model:

lin_model = LinearRegression().fit(X_train, y_train)

# Getting the model's R^2 score:

score = lin_model.score(X_train, y_train)
score

0.00033901626376764415

In [133]:
# Predicting the outputs for the Testing Data Set:

y_pred = lin_model.predict(X_test)

# Getting the Mean Squared Error (MSE):

rmse_lin = mse(y_test,y_pred, squared=True)
rmse_lin

0.2491552081821679

In [134]:
# Calculating the F1-score:

f1_s = f1_score(y_test, y_pred, average='micro')
f1_s

ValueError: Classification metrics can't handle a mix of binary and continuous targets

### 4.2 Logistic Regression Model:

In [120]:
# Building the logistic Model Using 10-fold Cross Validation:

log_model = LogisticRegressionCV(cv=10, random_state=0).fit(X_train2, y_train2)

# Predicting the outputs for the Testing Data Set:

y_pred2 = log_model.predict(X_test2)

In [121]:
# Calculating the F1-score:

f1_s = f1_score(y_test2, y_pred2, average='micro')
f1_s

0.5319435104236718