In [1]:
import re
import nltk
import pandas as pd
import string
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [2]:
# The file is read with header=None (its first line is treated as data),
# so the column labels are attached afterwards.
column_names = ['time','username','text','ticker']
df = pd.read_csv('reddit_data.csv', header=None)
df.columns = column_names

In [3]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,time,username,text,ticker
0,1613442839,staywokeordie,"['Michael', 'Burry', 'says', 'gouge', 'an', 'e...",TSLA
1,1613438992,StonkWonker,"['TSLA', 'TO', 'MARS', '🚀🚀🚀']",TSLA
2,1613438228,Certain-Ebb-9411,"['Michael', 'Burry', 'deleted', 'tweets', 'abo...",TSLA
3,1613437991,setoxxx,"['True', 'Story', 'of', 'a', 'Redditor', 'that...",TSLA
4,1613436360,shadyshit123,"['Thoughts', 'on', 'the', 'stock', 'which', 'S...",TSLA


In [4]:
def clean_text(text_list):
    """Turn one stringified token list into a list of cleaned, stemmed words.

    Parameters
    ----------
    text_list : str
        The text of one row. It looks like a Python list but is stored as a
        plain string, e.g. "['TSLA', 'TO', 'MARS']".

    Returns
    -------
    list of str
        Lower-cased words with stop words and the filter keywords removed,
        each reduced to its Porter stem.
    """
    # Keep only letters and spaces, then lowercase and split on whitespace.
    # This already removes brackets, quotes, commas, digits, emoji and all
    # punctuation, so no separate punctuation pass is needed afterwards.
    # Side effect: contractions lose their apostrophe ("can't" -> "cant")
    # before they are compared with the stop-word list.
    letters_only = ''.join(ch for ch in text_list if ch.isalpha() or ch == ' ')
    words = letters_only.lower().split()

    # Stop words add nothing to sentiment analysis. 'tsla', 'elon' and 'musk'
    # are dropped as well: they were the keywords used to select the texts,
    # so they are known to be present.
    excluded = set(stopwords.words('english'))
    excluded.update(['tsla','elon','musk'])

    # Stem the surviving words to reduce them to their core form.
    ps = nltk.PorterStemmer()
    return [ps.stem(word) for word in words if word not in excluded]

# Replace every stringified token list with its cleaned, stemmed word list.
# NOTE: this overwrites df['text']; re-running the cell would feed the
# already-cleaned lists back through clean_text, so reload df first.
df['text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,time,username,text,ticker
0,1613442839,staywokeordie,"[michael, burri, say, goug, eye, short, dog, c...",TSLA
1,1613438992,StonkWonker,[mar],TSLA
2,1613438228,Certain-Ebb-9411,"[michael, burri, delet, tweet]",TSLA
3,1613437991,setoxxx,"[true, stori, redditor, sold, gme]",TSLA
4,1613436360,shadyshit123,"[thought, stock, sir, push, cant, say, name, t...",TSLA


In [5]:
def fix_words(list_BOW):
    """Strip the leading ``key=`` prefix from every string in ``list_BOW``.

    Each string loses everything up to and including its first ``=``;
    strings that contain no ``=`` are left as they are. The list is updated
    in place and the same list object is returned.
    """
    key_prefix = re.compile(r'^.*?=')
    list_BOW[:] = [key_prefix.sub('', token) for token in list_BOW]
    return list_BOW


def make_db(BOW_filename):
    """Read a file of space-separated ``key=value`` tokens into a list of rows.

    The content is split on newlines, every non-blank line is split on single
    spaces, and the tokens of each line are passed through ``fix_words`` so
    that only the part after the first ``=`` remains.

    Blank lines (including the empty string left after a trailing newline)
    are skipped, so they do not turn into rows holding a single empty string.

    Parameters
    ----------
    BOW_filename : str
        Path of the file to read.

    Returns
    -------
    list of list of str
        One list of tokens per non-blank line, in file order.
    """
    # No explicit close is needed: the with-block closes the file on exit.
    with open(BOW_filename,'r') as read_file:
        content = read_file.read()

    rows = []
    for line in content.split('\n'):
        if not line.strip():
            continue
        rows.append(fix_words(line.split(' ')))
    return rows

# Parse the subjectivity-clue file into one list of field values per line.
abcd = make_db("subjclueslen1-HLTEMNLP05.tff")


In [6]:
# Build the lexicon frame from the parsed rows. Seven column names are given
# although the rows previewed in this notebook have six fields.
# NOTE(review): 'x' presumably catches rows that split into a seventh token;
# on such rows the polarity would land in 'x' instead of 'sentiment' -- TODO confirm.
df2 = pd.DataFrame(abcd, columns=['type','len','word','pos','stemmed','sentiment','x']) 


In [7]:
# Discard the spill-over column 'x' and preview the lexicon frame.
df2 = df2.drop(columns=['x'])
df2.head()


Unnamed: 0,type,len,word,pos,stemmed,sentiment
0,weaksubj,1,abandoned,adj,n,negative
1,weaksubj,1,abandonment,noun,n,negative
2,weaksubj,1,abandon,verb,y,negative
3,strongsubj,1,abase,verb,y,negative
4,strongsubj,1,abasement,anypos,y,negative


In [8]:
def stem(df):
    """Collect the positive and the negative lexicon words from ``df``.

    Words whose ``stemmed`` flag is ``'n'`` are Porter-stemmed first; all
    other words are taken verbatim. The word is computed locally instead of
    being written back with ``df.iloc[i]['word'] = ...`` and re-read: that
    chained assignment may or may not reach ``df`` depending on the pandas
    version, which could silently drop the stemming.

    NOTE(review): the tokens produced by clean_text are always Porter-stemmed,
    whereas words flagged ``'y'`` are kept verbatim here, so they only match
    when the verbatim form equals its Porter stem -- confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``word``, ``stemmed`` and ``sentiment``.
        It is only read, never modified.

    Returns
    -------
    (list, list)
        The distinct positive words and the distinct negative words
        (in no particular order). Rows with any other ``sentiment`` value
        are ignored.
    """
    positive = set()
    negative = set()
    ps = nltk.PorterStemmer()
    # Iterating the three columns together avoids one iloc lookup per field.
    for word, stemmed, sentiment in zip(df['word'], df['stemmed'], df['sentiment']):
        if stemmed == 'n':
            word = ps.stem(word)
        if sentiment == 'negative':
            negative.add(word)
        if sentiment == 'positive':
            positive.add(word)
    return list(positive), list(negative)

In [9]:
# p / n: the positive and the negative word lists returned by stem().
p,n = stem(df2)

In [10]:
def gen_sentiment(df,pos_list,neg_list):
    """Score every row of ``df`` by counting lexicon hits among its tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``text`` column whose entries are iterables of hashable
        tokens. It is not modified; a copy is returned.
    pos_list, neg_list : iterable
        Positive and negative lexicon words. A token found in both
        collections is counted on both sides.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with two extra columns:

        * ``sentiment`` -- 1 if there are more positive than negative hits,
          -1 if there are more negative ones, 0 on a tie (including no hits);
        * ``value`` -- the tuple ``(count_pos, count_neg)``.
    """
    # Sets make each membership test constant-time instead of a list scan.
    pos_words = set(pos_list)
    neg_words = set(neg_list)

    sentiments = []
    values = []
    for tokens in df['text']:
        count_pos = 0
        count_neg = 0
        for token in tokens:
            if token in pos_words:
                count_pos += 1
            if token in neg_words:
                count_neg += 1
        # Sign of (count_pos - count_neg): 1, 0 or -1.
        sentiments.append((count_pos > count_neg) - (count_pos < count_neg))
        values.append((count_pos, count_neg))

    final = df.copy()
    # Assign whole columns at once; explicit dtypes keep 'sentiment' integer
    # and store each (count_pos, count_neg) tuple as a single object cell.
    final['sentiment'] = pd.Series(sentiments, index=final.index, dtype='int64')
    final['value'] = pd.Series(values, index=final.index, dtype=object)
    return final
    

In [11]:
# Score every row; `final` is a copy of df with 'sentiment' and 'value' added.
final = gen_sentiment(df,p,n)

In [12]:
# gen_sentiment works on a copy, so df itself shows no sentiment columns.
df.head()

Unnamed: 0,time,username,text,ticker
0,1613442839,staywokeordie,"[michael, burri, say, goug, eye, short, dog, c...",TSLA
1,1613438992,StonkWonker,[mar],TSLA
2,1613438228,Certain-Ebb-9411,"[michael, burri, delet, tweet]",TSLA
3,1613437991,setoxxx,"[true, stori, redditor, sold, gme]",TSLA
4,1613436360,shadyshit123,"[thought, stock, sir, push, cant, say, name, t...",TSLA


In [13]:
# First rows of the scored frame, including 'sentiment' and 'value'.
final.head()

Unnamed: 0,time,username,text,ticker,sentiment,value
0,1613442839,staywokeordie,"[michael, burri, say, goug, eye, short, dog, c...",TSLA,-1,"(0, 2)"
1,1613438992,StonkWonker,[mar],TSLA,-1,"(0, 1)"
2,1613438228,Certain-Ebb-9411,"[michael, burri, delet, tweet]",TSLA,0,"(0, 0)"
3,1613437991,setoxxx,"[true, stori, redditor, sold, gme]",TSLA,1,"(1, 0)"
4,1613436360,shadyshit123,"[thought, stock, sir, push, cant, say, name, t...",TSLA,1,"(1, 0)"


In [14]:
# Number of rows in the scored frame.
len(final)

807

In [15]:
# Number of distinct usernames (a missing value would count as one name).
final['username'].nunique(dropna=False)

624