# Stock-Related Tweets Cleaning

In [1]:
import numpy as np
import pandas as pd
import spacy
import csv

In [2]:
# Load the spaCy pipeline named "en_core_web_lg"; later cells call
# nlp(text).vector on tweet snippets to obtain one vector per tweet.
nlp = spacy.load("en_core_web_lg")

In [3]:
# Build one DataFrame `df` from rawdata/TwitterMentions2.csv through
# rawdata/TwitterMentions12.csv, keeping only the columns used downstream
# and replacing the raw "Date" column with numeric "month" and "day" columns.
columns = ["month", "day", "Snippet", "Impact", "Mentioned Authors"]

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies everything accumulated so far on every
# iteration, and seeding the result with an empty DataFrame can degrade the
# month/day columns to object dtype.
frames = []
for index in range(2, 13):
    filename = "rawdata/TwitterMentions{}.csv".format(index)
    temp_df = pd.read_csv(
        filename,
        header=0,
        usecols=["Date", "Snippet", "Impact", "Mentioned Authors"],
        encoding="latin1",
    )
    # Parse the dates once and derive both calendar fields from the result.
    dates = pd.to_datetime(temp_df["Date"])
    temp_df = temp_df.assign(month=dates.dt.month, day=dates.dt.day)
    frames.append(temp_df[columns])

df = pd.concat(frames, ignore_index=True)

In [6]:
# Sanity check: (number of rows, number of columns) of the combined frame.
df.shape

(95000, 5)

In [7]:
# For every selected company and every (month, day) on which it has tweets,
# write two files:
#   Tweets_cleaned/comment_vectors/<company>-<month>-<day>.npz
#       array "inputs": one spaCy document vector per tweet snippet
#   Tweets_cleaned/impact_weights/<company>-<month>-<day>.txt
#       the matching "Impact" values, one per line, in the same row order
# Both output directories must already exist.
selected_company = ["facebook", "amazon", "tesla", "netflix", "apple", "google"]
for comp in selected_company:
    # Case-sensitive pattern match on the author field; rows whose
    # "Mentioned Authors" value is missing are excluded (na=False).
    df_cp = df.loc[df["Mentioned Authors"].str.contains(comp, na=False)]

    # A single groupby pass replaces re-filtering the frame once per month
    # and again once per day. sort=False keeps first-appearance order.
    for (month, day), df_final in df_cp.groupby(["month", "day"], sort=False):
        # nlp.pipe processes the snippets as a batch stream rather than
        # invoking the whole pipeline separately for each tweet.
        words = np.array([doc.vector for doc in nlp.pipe(df_final["Snippet"])])
        np.savez("Tweets_cleaned/comment_vectors/{}-{}-{}".format(comp, month, day), inputs=words)
        np.savetxt("Tweets_cleaned/impact_weights/{}-{}-{}.txt".format(comp, month, day), df_final["Impact"].to_numpy())

In [None]:
# Inspect the rows dated month 10, day 15 whose "Mentioned Authors" value
# contains "amazon". na=False makes rows with a missing author field count
# as non-matches explicitly, consistent with the company filter used for the
# per-day export.
df.loc[
    (df["month"] == 10)
    & (df["day"] == 15)
    & (df["Mentioned Authors"].str.contains("amazon", na=False))
]

In [7]:
# Count the collected tweets for every (month, day) slot of 2020.
#   num[i]  -> number of rows in df for that slot
#   date[i] -> matching label formatted as "<month>/<day>/2020"
# NOTE: every month is given 31 day slots, so impossible dates such as
# 2/30/2020 are emitted as well (always with a count of 0).

# Count all (month, day) pairs in one pass instead of scanning the whole
# frame once per slot (12 * 31 = 372 boolean-mask scans).
tweets_per_day = df.groupby(["month", "day"]).size()

num = []
date = []
for month in range(1, 13):
    for day in range(1, 32):
        # Slots with no tweets are absent from the grouped counts -> 0.
        num.append(int(tweets_per_day.get((month, day), 0)))
        date.append("{}/{}/2020".format(month, day))

In [13]:
# Tabulate the per-day tweet counts and write them to
# number_of_tweets_collected.csv (the default integer index is written as
# the first, unnamed column).
# Building the frame from a dict keeps the counts numeric; going through
# np.array([date, num]) would coerce every count to a string because a NumPy
# array holds a single dtype. The CSV text written is the same either way.
data_csv = pd.DataFrame({"date": date, "number_of_tweets_collected": num})
data_csv.to_csv("number_of_tweets_collected.csv")