In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction import text
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import LinearSVC

from tweet_cleaner import TweetCleaner

In [2]:
# Locations of the labelled tweet CSVs, relative to the notebook's working directory.
TRAIN_PATH = 'data/Train.csv'
TEST_PATH = 'data/Test.csv'

In [3]:
def load_tweets(path, cleaner):
    """Read a tweet CSV, drop rows labelled 'irrelevant', and add a 'cleaned' column.

    Parameters
    ----------
    path : str
        CSV file that has at least 'Sentiment' and 'TweetText' columns.
    cleaner : object
        Must expose ``clean(text)``; it is applied to every 'TweetText' value.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the kept rows (original row index preserved)
        plus the 'cleaned' column.
    """
    raw_df = pd.read_csv(path)
    relevant_df = raw_df[raw_df['Sentiment'] != 'irrelevant']
    # .assign returns a fresh frame, so the new column is never written into a
    # filtered slice (the pattern that triggers pandas' SettingWithCopyWarning).
    return relevant_df.assign(cleaned=relevant_df['TweetText'].apply(cleaner.clean))


cleaner = TweetCleaner()

# Train and test go through exactly the same filtering and cleaning.
train_df = load_tweets(TRAIN_PATH, cleaner)
test_df = load_tweets(TEST_PATH, cleaner)

## Results (Organizations)

In [4]:
# Two TF-IDF views of each cleaned tweet: word n-grams (1-4 tokens) and
# character n-grams (3-5 characters). min_df=2 drops n-grams that occur in
# fewer than two training tweets; sublinear_tf uses 1 + log(tf) instead of tf.
word_vectorizer = text.TfidfVectorizer(
    analyzer='word', ngram_range=(1, 4),
    min_df=2, use_idf=True, sublinear_tf=True)
char_vectorizer = text.TfidfVectorizer(
    analyzer='char', ngram_range=(3, 5),
    min_df=2, use_idf=True, sublinear_tf=True)

# FeatureUnion concatenates both feature spaces column-wise. The single-step
# Pipeline wrapper is kept so `ngrams_vectorizer` stays the same kind of object
# for later cells.
ngrams_vectorizer = Pipeline([
    ('feats',
     FeatureUnion([('word_ngram', word_vectorizer),
                   ('char_ngram', char_vectorizer),
                   ]))])

# Fit the vocabulary on the training tweets only; the test tweets are merely
# projected onto it.
train_feature = ngrams_vectorizer.fit_transform(train_df['cleaned'])
test_feature = ngrams_vectorizer.transform(test_df['cleaned'])

# LinearSVC's solver draws random numbers while fitting; pinning random_state
# makes the report below reproducible across Restart & Run All.
org_cls = LinearSVC(random_state=42)
org_cls.fit(train_feature, train_df['Topic'])
org_predictions = org_cls.predict(test_feature)
report = classification_report(test_df['Topic'], org_predictions)
print(report)

              precision    recall  f1-score   support

       apple       0.83      0.90      0.87        93
      google       0.76      0.65      0.70        48
   microsoft       0.78      0.65      0.70        48
     twitter       0.64      0.73      0.68        48

    accuracy                           0.76       237
   macro avg       0.75      0.73      0.74       237
weighted avg       0.77      0.76      0.76       237



## Results (Sentiment)

In [5]:
# Numeric code appended to the n-gram features so the sentiment model knows
# which organization a tweet is about.
# NOTE(review): these codes impose an apple < google < microsoft < twitter
# ordering on a linear model; a one-hot encoding may be a better fit — confirm
# before changing, since it alters the trained model.
ORGANIZATION_TO_FEATURE = {'apple': 0.1, 'google': 0.2, 'microsoft': 0.3, 'twitter': 0.4}


def append_topic_feature(ngram_features, topics):
    """Return `ngram_features` with one extra column holding each row's topic code.

    Parameters
    ----------
    ngram_features : scipy sparse matrix, shape (n_rows, n_features)
        Output of ``ngrams_vectorizer``.
    topics : iterable of str
        One organization name per row, in the same order as `ngram_features`.

    Returns
    -------
    scipy.sparse.csr_matrix, shape (n_rows, n_features + 1)

    Raises
    ------
    KeyError
        If a topic is not a key of ``ORGANIZATION_TO_FEATURE``.
    """
    topic_column = np.array(
        [ORGANIZATION_TO_FEATURE[topic] for topic in topics], dtype=float
    ).reshape(-1, 1)
    # Stay sparse: densifying the TF-IDF matrix into a DataFrame is memory-hungry
    # and produces mixed int/str column names, which recent scikit-learn rejects.
    return sparse.hstack([ngram_features, sparse.csr_matrix(topic_column)], format='csr')


# New names instead of overwriting train_feature / test_feature, so this cell
# can be re-run without first re-running the vectorizer cell.
train_sent_feature = append_topic_feature(train_feature, train_df['Topic'])
test_sent_feature = append_topic_feature(test_feature, test_df['Topic'])

# Seeded for reproducible results (LinearSVC's solver uses random numbers).
sent_cls = LinearSVC(random_state=42)
sent_cls.fit(train_sent_feature, train_df['Sentiment'])
report = classification_report(test_df['Sentiment'], sent_cls.predict(test_sent_feature))
print(report)

              precision    recall  f1-score   support

    negative       0.73      0.49      0.59        49
     neutral       0.81      0.92      0.86       156
    positive       0.67      0.56      0.61        32

    accuracy                           0.78       237
   macro avg       0.74      0.66      0.69       237
weighted avg       0.78      0.78      0.77       237



## Test it!

In [6]:
# Smoke test: classify one hand-written tweet with both models.
tweet = 'Apple is your choice'
organization = 'apple'

# The organization model only sees the n-gram features.
ngram_feature = ngrams_vectorizer.transform([cleaner.clean(tweet)])
print(org_cls.predict(ngram_feature)[0])

# The sentiment model expects the n-gram features followed by one column with
# the organization code. Stack it sparsely rather than going through a dense
# DataFrame, which would carry mixed int/str column names.
topic_column = sparse.csr_matrix([[ORGANIZATION_TO_FEATURE[organization]]])
sent_feature = sparse.hstack([ngram_feature, topic_column], format='csr')
print(sent_cls.predict(sent_feature)[0])

apple
neutral
