# Common part

Natural Language Toolkit dictionaries downloading

In [1]:
import nltk

nltk.download('words')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package words to /Users/simon/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /Users/simon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/simon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/simon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import pandas

def load_map(file_name):
    result = pandas.read_csv(f'data/{file_name}.csv')
    return { str(row["key"]): str(row["value"]) for i, row in result.iterrows() }

In [3]:
import re

regex_dict = {
    'URL': r"""(?xi)\b(?:(?:https?|ftp|file):\/\/|www\.|ftp\.|pic\.|twitter\.|facebook\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:;,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:;,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])""",
    'EMOJI': u'([\U0001F1E0-\U0001F1FF])|([\U0001F300-\U0001F5FF])|([\U0001F600-\U0001F64F])|([\U0001F680-\U0001F6FF])|([\U0001F700-\U0001F77F])|([\U0001F800-\U0001F8FF])|([\U0001F900-\U0001F9FF])|([\U0001FA00-\U0001FA6F])|([\U0001FA70-\U0001FAFF])|([\U00002702-\U000027B0])|([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])',
    'HASHTAG': r"\#\b[\w\-\_]+\b",
    'EMAIL': r"(?:^|(?<=[^\w@.)]))(?:[\w+-](?:\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(?:\.(?:[a-z]{2,})){1,3}(?:$|(?=\b))",
    'MENTION': r"@[A-Za-z0-9]+",
    'CASHTAG': r"(?:[$\u20ac\u00a3\u00a2]\d+(?:[\\.,']\d+)?(?:[MmKkBb](?:n|(?:il(?:lion)?))?)?)|(?:\d+(?:[\\.,']\\d+)?[$\u20ac\u00a3\u00a2])",
    'DATE': r"(?:(?:(?:(?:(?<!:)\b\'?\d{1,4},? ?)?\b(?:[Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|May|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]ct(?:ober)?|[Nn]ov(?:ember)?|[Dd]ec(?:ember)?)\b(?:(?:,? ?\'?)?\d{1,4}(?:st|nd|rd|n?th)?\b(?:[,\\/]? ?\'?\d{2,4}[a-zA-Z]*)?(?: ?- ?\d{2,4}[a-zA-Z]*)?(?!:\d{1,4})\b))|(?:(?:(?<!:)\b\\'?\d{1,4},? ?)\b(?:[Jj]an(?:uary)?|[Ff]eb(?:ruary)?|[Mm]ar(?:ch)?|[Aa]pr(?:il)?|May|[Jj]un(?:e)?|[Jj]ul(?:y)?|[Aa]ug(?:ust)?|[Ss]ept?(?:ember)?|[Oo]ct(?:ober)?|[Nn]ov(?:ember)?|[Dd]ec(?:ember)?)\b(?:(?:,? ?\'?)?\d{1,4}(?:st|nd|rd|n?th)?\b(?:[,\\/]? ?\'?\d{2,4}[a-zA-Z]*)?(?: ?- ?\d{2,4}[a-zA-Z]*)?(?!:\d{1,4})\b)?))|(?:\b(?<!\d\\.)(?:(?:(?:[0123]?[0-9][\\.\\-\\/])?[0123]?[0-9][\\.\\-\\/][12][0-9]{3})|(?:[0123]?[0-9][\\.\\-\\/][0123]?[0-9][\\.\\-\\/][12]?[0-9]{2,3}))(?!\.\d)\b))",
    'TIME': r'(?:(?:\d+)?\\.?\d+(?:AM|PM|am|pm|a\\.m\\.|p\\.m\\.))|(?:(?:[0-2]?[0-9]|[2][0-3]):(?:[0-5][0-9])(?::(?:[0-5][0-9]))?(?: ?(?:AM|PM|am|pm|a\\.m\\.|p\\.m\\.))?)',
    'EMPHASIS': r"(?:\*\b\w+\b\*)",
    'ELONG': r"\b[A-Za-z]*([a-zA-Z])\1\1[A-Za-z]*\b",
    'PUNCTUATION': r"[\-\"`@#$%^&*(|)/~\[\]{\}!:;+,._='?]+"
}

regex = { k: re.compile(regex_dict[k]) for k, v in regex_dict.items() }

In [4]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

emnlp = load_map("emnlp_dict")
contraction_mapping = load_map("contraction_mapping")
emoticons = load_map("emoticons")

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=2)
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), min_df=2)

In [6]:
import unicodedata

def normalize_string(string):
    string = string.lower()

    for word, word_replace in contraction_mapping.items():
        string = string.replace(word, word_replace)

    for key, reg in regex.items():
        string = reg.sub(f" {key} ", string)

    string = unicodedata.normalize('NFKD', string).encode('ascii', errors='ignore').decode('utf8', errors='ignore')
    string = word_tokenize(string)
    string = [emnlp[word] if str(word) in emnlp else word for word in string]
    string = [lemmatizer.lemmatize(word) for word in string if not word in stop_words]
    string = ' '.join(string)
    return string
    

# Organization prediction

- The ability to predict organization (e.g. Apple) given a Tweet

In [7]:
from sklearn.pipeline import FeatureUnion, Pipeline

company_vectorizer = Pipeline([
    ('feats', FeatureUnion(
        [
            ('word_ngram', word_vectorizer),
            ('char_ngram', char_vectorizer)
        ]
    ))
])

In [8]:
from nltk import word_tokenize

def normalize_string_for_company(string):
    string = string.lower()
    string = re.sub(r'(@|#)(apple|microsoft|windows|xbox|office|google|twitter|android)', r'\2', string)

    for word in emoticons.keys():
        string = string.replace(word, " ")

    return normalize_string(string)

In [9]:
test_df = pandas.read_csv("data/test.csv")
train_df = pandas.read_csv("data/train.csv")

train_cleaned = train_df['TweetText'].apply(normalize_string_for_company)
test_cleaned = test_df['TweetText'].apply(normalize_string_for_company)

company_vectorizer = company_vectorizer.fit(train_cleaned)

x_train = company_vectorizer.transform(train_cleaned)
x_test = company_vectorizer.transform(test_cleaned)

y_train = train_df['Topic']
y_test = test_df['Topic']

In [10]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

svc = LinearSVC()
svc.fit(x_train,y_train)

pred_company = svc.predict(x_test)
print(classification_report(y_test,pred_company))

              precision    recall  f1-score   support

       apple       0.95      0.96      0.95        98
      google       0.85      0.80      0.82        79
   microsoft       0.81      0.73      0.77        78
     twitter       0.75      0.85      0.80        87

    accuracy                           0.84       342
   macro avg       0.84      0.83      0.84       342
weighted avg       0.84      0.84      0.84       342



# Sentiment Analysis

In [11]:
sentiment_vectorizer = Pipeline([
    ('feats', FeatureUnion(
        [
            ('word_ngram', word_vectorizer),
            ('char_ngram', char_vectorizer)
        ]
    ))
])

In [12]:
from nltk import word_tokenize

def normalize_string_for_sentiment(string):
    for word, word_replace in emoticons.items():
        string = string.replace(word, word_replace[1:-1])

    return normalize_string(string)

In [13]:
import scipy.sparse
import numpy as np

def transform_topic(v, name, value=0.1):
    return [value if x == name else 0 for x in v]

test_df = pandas.read_csv("data/test.csv")
test_df = test_df[test_df["Sentiment"] != "irrelevant"]

train_df = pandas.read_csv("data/train.csv")
train_df = train_df[train_df["Sentiment"] != "irrelevant"]

train_cleaned = train_df['TweetText'].apply(normalize_string_for_company)
test_cleaned = test_df['TweetText'].apply(normalize_string_for_company)

sentiment_vectorizer = sentiment_vectorizer.fit(train_cleaned)

x_train = sentiment_vectorizer.transform(train_cleaned)
x_train = scipy.sparse.hstack((
    x_train, 
    np.asmatrix(transform_topic(train_df['Topic'].values, "positive")).transpose(),
    np.asmatrix(transform_topic(train_df['Topic'].values, "neutral")).transpose(),
    np.asmatrix(transform_topic(train_df['Topic'].values, "negative")).transpose()
))

x_test = sentiment_vectorizer.transform(test_cleaned)
x_test = scipy.sparse.hstack((
    x_test, 
    np.asmatrix(transform_topic(test_df['Topic'].values, "positive")).transpose(),
    np.asmatrix(transform_topic(test_df['Topic'].values, "neutral")).transpose(),
    np.asmatrix(transform_topic(test_df['Topic'].values, "negative")).transpose()
))

y_train_named = train_df['Sentiment']
y_test = test_df['Sentiment']

- Expand to more than 3 sentiment classes (say in 5-point scale)

In [14]:
from sklearn.svm import LinearSVR, LinearSVC
from sklearn.linear_model import LinearRegression, SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix

def divide_into_groups(x):
    negative = x[0]
    neutral = x[1]
    positive = x[2]
    
    if neutral >= negative and neutral >= positive:
        if neutral - positive < 0.3:
            return "semi-positive"
        if neutral - negative < 0.3:
            return "semi-negative"
        else:
            return "neutral"
    if negative >= neutral and negative >= positive:
        if negative < 0:
            return "semi-negative"
        else:
            return "negative"
    if positive >= negative and positive >= neutral:
        if positive < 0:
            return "semi-positive"
        else:
            return "positive"

svc = LinearSVC()
svc.fit(x_train,y_train_named)
pred_svc = svc.predict(x_test)

print(classification_report(pred_svc, y_test))

              precision    recall  f1-score   support

    negative       0.49      0.63      0.55        38
     neutral       0.88      0.80      0.84       173
    positive       0.53      0.65      0.59        26

    accuracy                           0.76       237
   macro avg       0.64      0.69      0.66       237
weighted avg       0.78      0.76      0.77       237

