# Models to use for prototype

#### A small container class that makes our data easier to work with

In [76]:
class Chat:
    """One chat message paired with the label assigned to it."""

    def __init__(self, chat, label):
        # Both values are stored exactly as given; no validation or
        # normalisation happens here.
        self.chat, self.label = chat, label

#### Append all of our data to a single list to be used with our model

In [77]:
hypechat = './comments/comments.txt'
hypelabels = './labels/hypelabel.txt'
normalchat = './comments/noexcite.txt'
normallabels = './labels/nohypelabel.txt'


def load_chats(chat_path, label_path):
    """Read a comments file and its labels file into a list of Chat objects.

    The two files are read in lockstep: line i of ``chat_path`` is paired with
    line i of ``label_path``. Surrounding whitespace (including the trailing
    newline) is stripped from both values.

    Note: pairing uses ``zip``, so if one file is longer than the other the
    extra lines are silently ignored.
    """
    with open(chat_path, encoding="utf-8") as chat_file, \
            open(label_path, encoding="utf-8") as label_file:
        return [Chat(line.strip(), label.strip())
                for line, label in zip(chat_file, label_file)]


# Collect both the hype samples and the normal samples in a single list,
# printing the running total after each file pair is loaded.
hype = []
hype.extend(load_chats(hypechat, hypelabels))
print(len(hype))
hype.extend(load_chats(normalchat, normallabels))
print(len(hype))


1025
2200


In [78]:
# Spot-check one loaded sample: display the chat text stored at index 58.
hype[58].chat

'WWWWWWWWWW'

#### Split our train and test data

In [79]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; random_state is fixed at 42 so
# the shuffled split is the same on every run.
train, test = train_test_split(hype, test_size=0.20, random_state=42, shuffle=True)

In [80]:
def _texts_and_labels(samples):
    """Return two parallel lists: the chat texts and the labels of ``samples``."""
    texts, labels = [], []
    for sample in samples:
        texts.append(sample.chat)
        labels.append(sample.label)
    return texts, labels


# Separate the model inputs (x) from the targets (y) for each split.
train_x, train_y = _texts_and_labels(train)
test_x, test_y = _texts_and_labels(test)



#### Create bag of words vectors

In [81]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder using CountVectorizer's default settings.
vectorizer = CountVectorizer()

# Learn the vocabulary from the training comments only and encode them as
# count vectors in the same step.
train_xv = vectorizer.fit_transform(train_x)

# Encode the test comments with the already-fitted vectorizer (transform only,
# so nothing is learned from the test set).
test_xv = vectorizer.transform(test_x)

In [82]:
#print(train_x)

### Classification

#### We will try using Logistic Regression & Naive Bayes and compare our results 

In [83]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the bag-of-words training matrix.
clf_l = LogisticRegression(random_state=0)
clf_l.fit(train_xv, train_y)

# Spot-check: print one held-out comment followed by the label predicted for it.
# (To inspect every held-out comment, run the same two prints for each index
# in range(len(test_x)).)
sample_idx = 38
print(test_x[sample_idx])
print(clf_l.predict(test_xv[sample_idx]))

AAHAHAHA
['hype']


### Naive bayes that we will train and use with our program.

In [84]:
import numpy as np
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes classifier trained on the same bag-of-words features.
clfNB = BernoulliNB()

# Fit on the sparse matrix directly. Calling .todense() first would build a
# full dense np.matrix, which costs memory and is rejected by recent
# scikit-learn releases; BernoulliNB accepts sparse input as-is.
clfNB.fit(train_xv, train_y)

# Spot-check the same held-out comment that was used for logistic regression.
print(test_x[38])
print(clfNB.predict(test_xv[38]))








AAHAHAHA
['hype']


In [85]:
# Mean accuracy on the held-out set: logistic regression first, then Naive Bayes.
for model in (clf_l, clfNB):
    print(model.score(test_xv, test_y))

0.8204545454545454
0.759090909090909


### F1 Score

#### Logistic Regression F1 Score

In [86]:
from sklearn.metrics import f1_score

# Per-class F1 for logistic regression, reported in the order ["hype", "not hype"].
lr_predictions = clf_l.predict(test_xv)
f1_score(test_y, lr_predictions, labels=["hype", "not hype"], average=None)


array([0.81755196, 0.82326622])

#### Naive Bayes F1 Score

In [87]:
# Per-class F1 for Naive Bayes, reported in the order ["hype", "not hype"].
nb_predictions = clfNB.predict(test_xv)
f1_score(test_y, nb_predictions, labels=["hype", "not hype"], average=None)

array([0.78367347, 0.72820513])

In [88]:
from os import listdir
from os.path import isfile, join
 
def _files_in(directory):
    """Return the names of the regular files located directly in ``directory``."""
    names = []
    for entry in listdir(directory):
        # Skip sub-directories and anything else that is not a plain file.
        if isfile(join(directory, entry)):
            names.append(entry)
    return names


hype_files = _files_in('./hypstreams/')
reg_files = _files_in('./regstreams/')


In [94]:
class Stream():
    """Chat log of one stream plus the per-comment predictions and states.

    Attributes:
        name: identifier of the stream (the notebook passes the file name).
        comments: the chat lines belonging to the stream.
        stream_label: label given to the stream as a whole.
        predictions: per-comment labels; entries equal to 'hype' are counted.
        states: per-comment states; entries equal to 'hype' are counted.
        hype_counts: running counter values; empty until setStreamStates
            assigns them.
    """

    def __init__(self, name, comments,
                 stream_label='hype',
                 predictions=None, states=None):
        self.name = name
        self.comments = comments
        self.stream_label = stream_label
        # Use a fresh list per instance: a literal [] default is created once
        # and would be shared by every Stream built without these arguments.
        # Test with `is None` rather than truthiness so that array-like
        # arguments (e.g. the output of a classifier's predict) are accepted.
        self.predictions = [] if predictions is None else predictions
        self.states = [] if states is None else states
        self.hype_counts = []

    def _printMajorityResults(self, labels):
        """Print one CSV line summarising how many entries of ``labels`` are 'hype'.

        Output columns: name, stream label, majority label, fraction of 'hype'
        entries, number of 'hype' entries. The majority is 'hype' only when
        'hype' entries strictly outnumber the rest; ties give 'not hype'.
        An empty sequence reports a fraction of 0.0 instead of dividing by zero.
        """
        total = len(labels)
        hype_total = sum(1 for x in labels if x == 'hype')
        majority = 'hype' if hype_total > total - hype_total else 'not hype'
        per_hype = hype_total / total if total else 0.0
        print(self.name, self.stream_label, majority, per_hype, hype_total, sep=',')

    def printStateResults(self):
        """Print the majority summary computed over ``self.states``."""
        self._printMajorityResults(self.states)

    def printPredictionResults(self):
        """Print the majority summary computed over ``self.predictions``."""
        self._printMajorityResults(self.predictions)
        
        

In [113]:
def setStreamStates(stream, threshold=10):
    """Derive a per-comment state from ``stream.predictions`` and store it.

    A running counter goes up by one for every 'hype' prediction and down by
    one (never below zero) for anything else. After each prediction the state
    is 'hype' if the counter has reached ``threshold``, otherwise 'not hype'.

    Results are written onto the stream object itself:
        stream.states      -- list of state labels, one per prediction
        stream.hype_counts -- list of counter values, one per prediction
    """
    running = 0
    counts = []
    labels = []
    for prediction in stream.predictions:
        if prediction == 'hype':
            running += 1
        else:
            # Decay towards zero, but never go negative.
            running = max(running - 1, 0)
        counts.append(running)
        labels.append('hype' if running >= threshold else 'not hype')

    stream.states = labels
    stream.hype_counts = counts
         
        

In [114]:
def build_streams(directory, filenames, stream_label):
    """Build one Stream per file, with a prediction for every line in the file.

    Args:
        directory: folder that contains the stream files.
        filenames: names of the files inside ``directory`` to load.
        stream_label: label stored on every Stream created here.

    Returns:
        A list of Stream objects in the same order as ``filenames``.
    """
    streams = []
    for fname in filenames:
        # Read as UTF-8 explicitly, matching how the training files are opened,
        # so decoding does not depend on the platform's default encoding.
        with open(join(directory, fname), encoding="utf-8") as f:
            comments = f.readlines()
        # Encode with the fitted vectorizer, then classify each comment with
        # the logistic-regression model.
        predictions = clf_l.predict(vectorizer.transform(comments))
        streams.append(Stream(fname, comments, stream_label=stream_label,
                              predictions=predictions))
    return streams


# Loop over our stream files and build the stream objects.
hype_streams = build_streams('./hypstreams/', hype_files, 'hype')
reg_streams = build_streams('./regstreams/', reg_files, 'not hype')


In [115]:
# calculate the states
# Compute the per-comment states for every stream, hype streams first.
threshold = 10
for stream in [*hype_streams, *reg_streams]:
    setStreamStates(stream, threshold=threshold)

# Report one summary line per stream, grouped under a heading per category
# with a blank line between the two groups.
state_sections = (
    ('**hype stream state results', hype_streams),
    ('**not hype stream state results', reg_streams),
)
for position, (heading, streams) in enumerate(state_sections):
    if position:
        print()
    print(heading)
    for stream in streams:
        stream.printStateResults()

**hype stream state results
hyp-TenZ-11-22.txt,hype,not hype,0.0,0
hyp-esl_csgo-11-22.txt,hype,hype,0.910828025477707,143
hyp-timthetatman-11-22.txt,hype,hype,0.9133858267716536,116
hyp-moistcr1tikal-11-15.txt,hype,hype,0.7578947368421053,144
hypestreams.txt,hype,not hype,0.0,0
hyp-GMHikaru-11-22.txt,hype,hype,0.9340277777777778,269
hyp-eslcsgo2-11-22.txt,hype,hype,0.9142857142857143,96
hyp-loltyler1-11-22.txt,hype,hype,0.8512396694214877,103

**not hype stream state results
reg-silentgarrett-11-21.txt,not hype,not hype,0.0,0
reg-gomisworld-11-21.txt,not hype,not hype,0.0,0
reg-aforestlife-11-21.txt,not hype,not hype,0.0,0
reg-keys-11-21.txt,not hype,not hype,0.0,0
reg-hologramdreams-11-21.txt,not hype,not hype,0.0,0
reg-coffeewithbee-11-21.txt,not hype,not hype,0.0,0
reg-sudarezz-11-20.txt,not hype,not hype,0.0,0


In [116]:
# print results
# Report one prediction summary line per stream, grouped under a heading per
# category with a blank line between the two groups.
prediction_sections = (
    ('**hype stream prediction results', hype_streams),
    ('**not hype stream prediction results', reg_streams),
)
for position, (heading, streams) in enumerate(prediction_sections):
    if position:
        print()
    print(heading)
    for stream in streams:
        stream.printPredictionResults()

**hype stream prediction results
hyp-TenZ-11-22.txt,hype,hype,1.0,3
hyp-esl_csgo-11-22.txt,hype,hype,0.9554140127388535,150
hyp-timthetatman-11-22.txt,hype,hype,0.8582677165354331,109
hyp-moistcr1tikal-11-15.txt,hype,hype,0.8052631578947368,153
hypestreams.txt,hype,not hype,0.4666666666666667,7
hyp-GMHikaru-11-22.txt,hype,hype,0.8993055555555556,259
hyp-eslcsgo2-11-22.txt,hype,hype,0.9047619047619048,95
hyp-loltyler1-11-22.txt,hype,hype,0.9338842975206612,113

**not hype stream prediction results
reg-silentgarrett-11-21.txt,not hype,not hype,0.44680851063829785,21
reg-gomisworld-11-21.txt,not hype,not hype,0.16,12
reg-aforestlife-11-21.txt,not hype,not hype,0.24,30
reg-keys-11-21.txt,not hype,not hype,0.2846153846153846,37
reg-hologramdreams-11-21.txt,not hype,not hype,0.38953488372093026,67
reg-coffeewithbee-11-21.txt,not hype,not hype,0.28688524590163933,35
reg-sudarezz-11-20.txt,not hype,not hype,0.484375,31


In [117]:
# Step-by-step trace (counter, state, prediction) for every hype stream whose
# file name contains 'moist'.
for stream in hype_streams:
    if 'moist' not in stream.name:
        continue
    print("time,hype count,state,prediction")
    for time, state in enumerate(stream.states):
        print(time, stream.hype_counts[time], state, stream.predictions[time], sep=',')

time,hype count,state,prediction
0,1,not hype,hype
1,2,not hype,hype
2,1,not hype,not hype
3,0,not hype,not hype
4,0,not hype,not hype
5,1,not hype,hype
6,0,not hype,not hype
7,1,not hype,hype
8,2,not hype,hype
9,3,not hype,hype
10,4,not hype,hype
11,3,not hype,not hype
12,2,not hype,not hype
13,3,not hype,hype
14,2,not hype,not hype
15,3,not hype,hype
16,2,not hype,not hype
17,1,not hype,not hype
18,2,not hype,hype
19,1,not hype,not hype
20,2,not hype,hype
21,3,not hype,hype
22,4,not hype,hype
23,5,not hype,hype
24,4,not hype,not hype
25,3,not hype,not hype
26,4,not hype,hype
27,3,not hype,not hype
28,4,not hype,hype
29,3,not hype,not hype
30,2,not hype,not hype
31,3,not hype,hype
32,4,not hype,hype
33,5,not hype,hype
34,4,not hype,not hype
35,5,not hype,hype
36,4,not hype,not hype
37,5,not hype,hype
38,4,not hype,not hype
39,5,not hype,hype
40,6,not hype,hype
41,5,not hype,not hype
42,6,not hype,hype
43,7,not hype,hype
44,8,not hype,hype
45,9,not hype,hype
46,10,hype,hype
47,11,hype,

In [118]:
# Step-by-step trace (counter, state, prediction) for every regular stream
# whose file name contains 'garrett'.
for stream in reg_streams:
    if 'garrett' not in stream.name:
        continue
    print("time,hype count,state,prediction")
    for time, state in enumerate(stream.states):
        print(time, stream.hype_counts[time], state, stream.predictions[time], sep=',')

time,hype count,state,prediction
0,0,not hype,not hype
1,0,not hype,not hype
2,0,not hype,not hype
3,0,not hype,not hype
4,1,not hype,hype
5,0,not hype,not hype
6,0,not hype,not hype
7,1,not hype,hype
8,2,not hype,hype
9,1,not hype,not hype
10,0,not hype,not hype
11,1,not hype,hype
12,2,not hype,hype
13,3,not hype,hype
14,2,not hype,not hype
15,1,not hype,not hype
16,2,not hype,hype
17,1,not hype,not hype
18,0,not hype,not hype
19,0,not hype,not hype
20,0,not hype,not hype
21,0,not hype,not hype
22,1,not hype,hype
23,2,not hype,hype
24,1,not hype,not hype
25,0,not hype,not hype
26,0,not hype,not hype
27,0,not hype,not hype
28,0,not hype,not hype
29,1,not hype,hype
30,0,not hype,not hype
31,1,not hype,hype
32,0,not hype,not hype
33,1,not hype,hype
34,0,not hype,not hype
35,1,not hype,hype
36,0,not hype,not hype
37,1,not hype,hype
38,2,not hype,hype
39,3,not hype,hype
40,4,not hype,hype
41,5,not hype,hype
42,6,not hype,hype
43,7,not hype,hype
44,6,not hype,not hype
45,5,not hype,not hype