Import Libraries & Data

In [49]:
import json
import pandas as pd
import re
import unicodedata as ud
from nltk import word_tokenize
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from scipy import sparse

import argparse

In [50]:

f = open('C:\\Users\\Jerem\\Documents\\Spring 2023\\LING-665\\FinalProject\\EXIST 2023 Dataset\\training\\EXIST2023_training.json', encoding = 'utf-8')
data = json.load(f)

Perform Preprocessing

In [51]:

def cleanTweet(tweet, lang):
    clean_tweet = re.sub("@[A-Za-z0-9_]+", '<USER>', tweet)
    clean_tweet = re.sub("#[A-Za-z0-9_]+","<HASH>", clean_tweet) 
    clean_tweet = re.sub("https[://\/-zA-Z0-9_=+.$@#$%^&*()]+", '<LINK>', clean_tweet) #Get rid of links
    clean_tweet = re.sub("[^\w\s]", '', clean_tweet)
    clean_tweet = re.sub(' +', ' ', clean_tweet)
    clean_tweet = clean_tweet.lower()
    
    if lang == 'es':
        clean_tweet = ' '.join(word_tokenize(clean_tweet, language='Spanish'))
        #print(tweet)
    elif lang == 'en':
        clean_tweet = ' '.join(word_tokenize(clean_tweet, language='English'))
        
    return clean_tweet




In [52]:
punct = '!\"#$%&\'()*+━, -./:;<=⁉>?@‼[\]🇽^_«»`{¡|}~…¿“”'

emoji_dict = {}
tweet_dict = {}
for x in data:
    
    lang = data[x]['lang']
    tweet = data[x]['tweet'].strip()
    labels = data[x]['labels_task1']
    
    emojis = [char for char in tweet if not char.isalpha() and char not in punct and not char.isdigit()]
    emojis = [(emoji, ud.name(emoji)) for emoji in emojis]
    for tuple in emojis:
        emoji = tuple[0]
        name = tuple[1]
        emoji_encoding = int(''.join([str(ord(c)) for c in name]))
        if emoji not in emoji_dict:
            emoji_dict[emoji] = emoji_encoding

    
    tweet = cleanTweet(tweet, lang).strip()
    

    item = (tweet, lang)
    # Check the labels
    yes = labels.count('YES')
    no = labels.count('NO')
    if yes == no:
        tweet_dict[item] = 1
    elif yes > 4:
        tweet_dict[item] = 1
    else:
        tweet_dict[item] = 0
    

print(emoji_dict)

{'🤷': 8372828571, '🏾': 697779747332777968737073698232707384908065848273677532848980694553, '\u200d': 9069827932877368847232747973786982, '♂': 776576693283737178, '️': 866582736584737978328369766967847982454954, '❤': 7269658689326676656775327269658284, '😊': 837773767378713270656769328773847232837773767378713269896983, '🖊': 767987698232766970843266657676807973788432806978, '😀': 71827378787378713270656769, '’': 827371728432837378717669328185798465847379783277658275, '👇': 87727384693268798778328079737884737871326665677572657868327378686988, '🏻': 6977797473327779687370736982327073849080658482736775328489806945494550, '👨': 776578, '🏽': 697779747332777968737073698232707384908065848273677532848980694552, '💻': 8069828379786576326779778085846982, '💀': 8375857676, '⚖': 836765766983, '📞': 846976698072797869328269676973866982, '‘': 7669708432837378717669328185798465847379783277658275, '🐼': 80657868653270656769, '🦁': 767379783270656769, '🦊': 7079883270656769, '🤍': 8772738469327269658284, '🔥': 707382

In [53]:
def binarize(count):
    return 0 if count == 0 else 1

Feature Engineering

In [54]:
eng_df = pd.DataFrame([{'tweet': tweet, "label":label, "char_len":len(tweet),"user":binarize(tweet.count('user')), "link":binarize(tweet.count('link'))} for (tweet, lang), label in tweet_dict.items() if lang == 'en'])
span_df = pd.DataFrame([{'tweet': tweet, "label":label, "char_len":len(tweet),"user":binarize(tweet.count('user')), "link":binarize(tweet.count('link'))} for (tweet, lang), label in tweet_dict.items() if lang == 'es'])

In [55]:
train, test = train_test_split(span_df, random_state=12, train_size=0.8)

#Get training tweets and labels into a list
X_train = train['tweet'].tolist()
Y_train = train['label'].tolist()

#Get training features
char_len_train = train['char_len'].tolist()
user_presence_train = train['user'].tolist()
link_presence_train = train['link'].tolist()


#Get test tweets and labels into a list
X_test = test['tweet'].tolist()
Y_test = test['label'].tolist()

#Get test features
char_len_test = test['char_len'].tolist()
user_presence_test = test['user'].tolist()
link_presence_test = test['link'].tolist()

In [56]:
# Initialize TF-IDF vectorizer with n-grams
tfidf = TfidfVectorizer(ngram_range=(2, 3), analyzer='char')


In [57]:
# fit and transform the training data
X_train_tfidf = tfidf.fit_transform(X_train)


# Add other features to TF-IDF matrix - train
print('Shape before adding features:', X_train_tfidf.shape)
X_train_tfidf = sparse.hstack((X_train_tfidf, np.array(char_len_train)[:,None], 
                                            np.array(user_presence_train)[:,None],
                                            np.array(link_presence_train)[:,None]))
print('Shape after adding features:', X_train_tfidf.shape)

Shape before adding features: (2912, 9295)
Shape after adding features: (2912, 9298)


In [58]:
# Fit and transform the test data
X_test_tfidf = tfidf.transform(X_test)

# Add other features to TF-IDF matrix - test
print('Shape before adding features:', X_test_tfidf.shape)
X_test_tfidf = sparse.hstack((X_test_tfidf, np.array(char_len_test)[:,None],
                                            np.array(user_presence_test)[:,None],
                                            np.array(link_presence_test)[:,None]))

print('Shape after adding features:', X_test_tfidf.shape)

Shape before adding features: (728, 9295)
Shape after adding features: (728, 9298)


Training the SVM

In [59]:
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 'scale'],
    'kernel': ['linear', 'rbf'],
    'degree': [2, 3],
    'coef0': [0.0, 0.5],
    'shrinking': [True, False],
    'probability': [True, False]
}

In [60]:
# instantiate the SVM model
svm_model = SVC()

# instantiate the GridSearchCV object with the SVM model and parameter grid
grid_search = GridSearchCV(svm_model, param_grid, cv=5)

# fit the GridSearchCV object to the training data
grid_search.fit(X_train_tfidf, Y_train)

# print the best parameters found by the grid search
print('Best parameters:', grid_search.best_params_)

# predict the test data using the best parameters
y_pred = grid_search.predict(X_test_tfidf)

# print the classification report
print(classification_report(Y_test, y_pred))