-
Notifications
You must be signed in to change notification settings - Fork 0
/
message_analysis.py
167 lines (113 loc) · 4.4 KB
/
message_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import dateutil
import train_classifier
import nltk
import re
import time
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import seaborn as sns; sns.set()
from nltk.sentiment.vader import SentimentIntensityAnalyzer
def open_file(filename):
    """Read a text file and return its contents as a list of lines.

    Parameters:
        filename: path to the chat-export text file.

    Returns:
        list of str, one entry per line (newline characters stripped).
    """
    # 'with' guarantees the handle is closed even on error (the original
    # never closed it). WhatsApp exports are typically UTF-8 — TODO confirm
    # for the chats this is run on.
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read().splitlines()
def ismessage(string):
    """Search one raw chat line for the parts of a WhatsApp message.

    Parameters:
        string: a single line from the chat export.

    Returns:
        dict mapping any of 'date', 'time', 'name', 'message' to the
        matched substring; a complete message yields all four keys.
    """
    # regex patterns describing each component of an exported message line
    patterns = {
        "date" :'([0-9]{2}/){2}[0-9]{4}',
        "time" :'[0-9]{2}:[0-9]{2}',
        "name" :' - .*?:',
        "message" :'[a-z]: .*$'
    }
    found = {}
    for key, pattern in patterns.items():
        hit = re.search(pattern, string)
        if hit is not None:
            found[key] = hit.group()
    return found
def process(content):
    """Turn raw chat-export lines into a timestamp-indexed DataFrame.

    Parameters:
        content: list of str, one raw line per entry (from open_file()).

    Returns:
        DataFrame indexed by message timestamp with columns
        'Name', 'Message', 'date_string', 'Date', 'Day', 'Hour'.
    """
    # Collect rows first, then build the frame once.  The original
    # pre-allocated a frame and wrote via df.iloc[j] starting at j=1,
    # which skipped positional row 0, could index past the end when every
    # line was a message, and relied on chained iloc assignment that
    # modern pandas does not guarantee to stick.
    rows = []
    for line in content:
        parts = ismessage(line)
        # a real message line yields all four components
        if len(parts) == 4:
            rows.append({
                'Name': parts['name'][3:-1],        # strip ' - ' prefix and ':' suffix
                'Message': parts['message'][3:],    # strip the 'x: ' lead-in
                'date_string': parts['date'] + ' ' + parts['time'],
            })
    df = pd.DataFrame(rows, columns=['Name', 'Message', 'date_string'])
    df = df[pd.notnull(df['Message'])]  # remove null messages (if any)
    # Parse the timestamp once, day-first as WhatsApp exports it, then
    # derive Day/Hour from the parsed value.  The original re-parsed
    # date_string with dateutil.parser (month-first by default), which
    # disagreed with the %d/%m/%Y format used here.
    df['Date'] = pd.to_datetime(df['date_string'], format='%d/%m/%Y %H:%M')
    df['Day'] = df['Date'].dt.strftime("%a")
    df['Hour'] = df['Date'].dt.strftime("%H")
    df.index = df['Date']
    return df
def make_plots(df):
    """Draw activity summaries: messages by hour, weekday, sender, and date.

    Parameters:
        df: DataFrame as produced by process() — timestamp-indexed with
            'Name', 'Message', 'Day' and 'Hour' columns.
    """
    fig = plt.figure()
    # BUG FIX: the original did `plt.title = "Whatsappening"`, which rebinds
    # the pyplot title *function* to a string and titles nothing.
    # suptitle() actually puts the title on the figure.
    fig.suptitle("Whatsappening")
    ax1 = plt.subplot2grid((4, 6), (0, 0), rowspan=2, colspan=2)
    ax2 = plt.subplot2grid((4, 6), (0, 2), rowspan=2, colspan=2)
    ax3 = plt.subplot2grid((4, 6), (0, 4), rowspan=2, colspan=2)
    ax4 = plt.subplot2grid((4, 6), (2, 0), rowspan=2, colspan=6)
    plt.tight_layout()
    # message count per hour of day
    df.groupby('Hour').count().plot(ax=ax1, legend=None, xlim=[0, 23])
    # message count per day of week
    df.groupby('Day').count().plot(y="Message", ax=ax2, kind='bar', legend=None)
    # message count per sender
    df.Name.value_counts().plot(ax=ax3, kind='bar')
    # message count per calendar date
    df.groupby(df.index.date).count().plot(y="Message", ax=ax4, legend=None)
    plt.show()
def clean_to_classify(message):
    """Normalize one message into the token list the classifier expects.

    Parameters:
        message: raw message text.

    Returns:
        list of lower-cased tokens, keeping only words of length >= 3
        and standalone '!' marks.
    """
    # remove links, media placeholders, and punctuation before tokenizing.
    # NOTE(review): the original list also contained '\p.*?(\s|$)', which is
    # an invalid regex escape (re.error in modern Python) — dropped here.
    patterns = [r'http.*?(\s|$)', r'\<Media omitted\>', r'[^\w\s\d!]+']
    cleaned = message
    for pattern in patterns:
        # BUG FIX: accumulate the substitutions.  The original applied each
        # pattern to the untouched `message`, so only the last pattern ever
        # took effect.
        cleaned = re.sub(pattern, ' ', cleaned)
    # lower-case everything, drop short words, keep '!' as its own token
    return [tok.lower() for tok in re.findall(r"[\w']+|!", cleaned)
            if len(tok) >= 3 or tok == '!']
def classify_messages(df):
    """Score each message with the pickled Naive Bayes classifier.

    Parameters:
        df: DataFrame with a 'Message' column of raw message strings.

    Returns:
        the same DataFrame with a 'sentiment' column appended.
    """
    # normalize every message into the token format used during training
    cleaned_text = [clean_to_classify(message) for message in df.Message]
    # 'with' closes the handle even if unpickling raises (the original used
    # a manual open/close pair).
    # NOTE(review): unpickling is only safe because this file is produced
    # locally by train_classifier — never unpickle untrusted data.
    with open('tweet_classifier.pickle', 'rb') as f:
        classifier = pickle.load(f)
    # classify each message with the same feature extractor we trained with
    df['sentiment'] = [
        classifier.classify(train_classifier.extract_features(text))
        for text in cleaned_text
    ]
    return df
def vader_sentiment(df):
    """Score each message with NLTK's VADER analyzer.

    Parameters:
        df: DataFrame with a 'Message' column of raw message strings.

    Returns:
        the same DataFrame with a 'sentiment' column of compound scores.
    """
    analyzer = SentimentIntensityAnalyzer()
    # 'compound' is VADER's single normalized polarity score per sentence
    df['sentiment'] = [
        analyzer.polarity_scores(sentence)['compound']
        for sentence in df.Message
    ]
    return df
def plot_sentiment(df):
    """Plot the mean sentiment score per calendar day.

    Parameters:
        df: DataFrame indexed by timestamp with a numeric 'sentiment' column.
    """
    plt.figure()
    axis = plt.gca()
    # label the three reference levels instead of raw numbers
    axis.set_yticks([-1, 0, 1])
    axis.set_yticklabels(['Negative', 'Neutral', 'Positive'])
    daily_mean = df.groupby(df.index.date).mean()
    daily_mean.plot(y='sentiment', ylim=[-1.5, 1.5], legend=None, ax=axis)
    plt.show()
if __name__ == '__main__':
    # Entry point: load a WhatsApp chat export, parse it into a DataFrame,
    # show the activity plots, then score and plot sentiment with VADER.
    filename = 'whatsapp_mess.txt'
    content = open_file(filename)
    processed_df = process(content)
    make_plots(processed_df)
    # The commented-out lines below compare the Naive Bayes classifier
    # against VADER on the same messages (disabled experiment).
    # d2 = processed_df.copy()
    data_panel = vader_sentiment(processed_df)
    plot_sentiment(data_panel)
    # data_panel2 = classify_messages(d2)
    # right = 100*sum(data_panel.sentiment*data_panel2.sentiment >= 0) / float(len(data_panel))
    # print 'The Naive Bayes and Vader agree on the polarity of %.2f%% of classifications' %right