In [1]:
%config IPCompleter.greedy=True
%load_ext memory_profiler
%matplotlib inline
%run DSF_FINAL_PROJECT_Helper.ipynb
%run DSF_FINAL_PROJECT_Helper_General.ipynb
%run DSF_FINAL_PROJECT_Plot.ipynb

### Description

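Using our previously built models together: each tweet is scored by combining the per-word sentiment coefficients with the per-word TF-IDF weights, and the resulting predictions are compared against the stored labels.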

<h3 style="background-color:powderblue;">Loading Pre-stored data</h3>

In [2]:
# Open the project's SQLite database file through the Sqlite3Db wrapper.
# Sqlite3Db is not defined in this notebook; presumably it is provided by one
# of the helper notebooks executed with %run in the first cell.
dbConn = Sqlite3Db('social_network.db')

In [3]:
# Select the tweet_vector and rate columns of every row in TweetsPrepEmb.
dbConn.query('''SELECT tweet_vector, rate FROM TweetsPrepEmb;''')

# The query must run first: the result rows are read back from the wrapper's cursor.
sqlStmTPERes = dbConn.cursor.fetchall()
# Wrap the fetched rows in a DataFrame with one named column per selected field.
df_Tweets_PrepEmb = pd.DataFrame(sqlStmTPERes, columns =['tweet_vector', 'rate'])

In [4]:
# Select the words and sentiment_coeff columns of every row in TweetsSentAnalysis.
dbConn.query('''SELECT words, sentiment_coeff FROM TweetsSentAnalysis; ''')

# The query must run first: the result rows are read back from the wrapper's cursor.
sqlStmSARes = dbConn.cursor.fetchall()
# Wrap the fetched rows in a DataFrame with one named column per selected field.
sentiment_map = pd.DataFrame(sqlStmSARes, columns =['words', 'sentiment_coeff'])

In [5]:
# Build the sentiment lookup {word: sentiment_coeff}.
# If a word occurs more than once in sentiment_map, the last occurrence wins.
sentiment_dict = {
    word: coeff
    for word, coeff in zip(
        sentiment_map['words'].values,
        sentiment_map['sentiment_coeff'].values,
    )
}

In [6]:
#sentiment_dict

<h3 style="background-color:powderblue;">Normalizing the Bow - TF-IDF</h3>

In [7]:
# Work on a copy so the frame loaded from the database is left untouched.
# The following cells compute TF-IDF weights for the words of every tweet and
# then substitute those weights for the words themselves.
df_Tweets_PrepEmb_weighting = df_Tweets_PrepEmb.copy()

In [8]:
# Fit a TF-IDF model on the tweets and weight every tweet in one pass.
# - tokenizer=str.split: tokens are obtained by splitting on whitespace only
#   (same behaviour as the former `lambda y: y.split()`, but picklable).
# - token_pattern=None: the default regex is ignored once a tokenizer is given;
#   passing None avoids scikit-learn's "token_pattern will not be used" warning.
# - norm=None: keep the raw tf*idf weights, without per-document normalisation.
tfidf = TfidfVectorizer(tokenizer=str.split, token_pattern=None, norm=None)
transformed = tfidf.fit_transform(df_Tweets_PrepEmb_weighting.tweet_vector)

# Vocabulary terms, in the column order of `transformed`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    features = pd.Series(tfidf.get_feature_names_out())
else:
    features = pd.Series(tfidf.get_feature_names())



In [9]:
# Instantiate the SentimentAnalysis helper used by the replacement steps below.
# The class is not defined in this notebook; presumably it comes from one of the
# helper notebooks executed with %run in the first cell.
sentimentAnalysis = SentimentAnalysis(version = "1.0")

In [10]:
%%time
# Row by row, call sentimentAnalysis.replace_tfidf_words with the fitted TF-IDF
# matrix and its feature names (presumably yielding one TF-IDF weight per word).
# NOTE(review): an earlier note said this step takes around 3-4 minutes, but the
# timing recorded under this cell is ~0.2 s wall time; the cost presumably
# depends on the size of the dataset.
replaced_tfidf_scores = df_Tweets_PrepEmb_weighting.apply(lambda x: sentimentAnalysis.replace_tfidf_words(x, transformed, features), axis=1)

CPU times: user 208 ms, sys: 939 µs, total: 209 ms
Wall time: 202 ms


In [11]:
# For every tweet, split it on whitespace and pass each word, together with the
# sentiment dictionary, to sentimentAnalysis.replace_sentiment_words; the result
# is one list of per-word sentiment scores per tweet, in word order.
replaced_closeness_scores = df_Tweets_PrepEmb_weighting.tweet_vector.apply(
    lambda sentence: [
        sentimentAnalysis.replace_sentiment_words(word, sentiment_dict)
        for word in sentence.split()
    ]
)

<h3 style="background-color:powderblue;">Using previous models (including all the steps) and predicting</h3>

In [12]:
# One row per tweet: per-word sentiment scores, per-word TF-IDF scores,
# the tweet's token string and its stored label.
replacement_df = pd.DataFrame({
    'sentiment_coeff': replaced_closeness_scores,
    'tfidf_scores': replaced_tfidf_scores,
    'sentence': df_Tweets_PrepEmb_weighting.tweet_vector,
    'sentiment': df_Tweets_PrepEmb_weighting.rate,
})

# Tweet-level score: dot product of the two per-word vectors.
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.array(row['sentiment_coeff']) @ np.array(row['tfidf_scores']),
    axis=1,
)

# A strictly positive score is predicted as class 1, everything else as class 0.
replacement_df['prediction'] = replacement_df['sentiment_rate'].gt(0).astype('int8')

# Binarise the stored label: 1 stays 1, any other value becomes 0.
replacement_df['sentiment'] = replacement_df['sentiment'].eq(1).astype('int64')

<h3 style="background-color:powderblue;">Simple metrics</h3>

In [13]:
# Ground-truth labels and model predictions, extracted once and reused by the
# metric computations.
y_test = replacement_df['sentiment']
predicted_classes = replacement_df['prediction']

print_formatted_text(HTML('<b>Performance of a classification model - Confusion Matrix</b>'))

# confusion_matrix receives the true labels first and the predictions second.
conf_matrix = pd.DataFrame(confusion_matrix(y_test, predicted_classes))
display(conf_matrix)



[0m[?7h[0;1mPerformance of a classification model - Confusion Matrix[0m
[0m

Unnamed: 0,0,1
0,114,43
1,146,133


In [14]:
# Accuracy, precision, recall and F1 of the predictions against the true labels.
test_scores = (
    accuracy_score(y_test, predicted_classes),
    precision_score(y_test, predicted_classes),
    recall_score(y_test, predicted_classes),
    f1_score(y_test, predicted_classes),
)

print_formatted_text(HTML('<b>Main Scores</b>'))

# One row per metric, with a single 'scores' column.
scores = pd.DataFrame(
    list(test_scores),
    index=['accuracy', 'precision', 'recall', 'f1'],
    columns=['scores'],
)
display(scores)

[0m[?7h[0;1mMain Scores[0m
[0m

Unnamed: 0,scores
accuracy,0.566514
precision,0.755682
recall,0.476703
f1,0.584615
