In [127]:
import csv
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import ClassifierChain
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from wordcloud import WordCloud,STOPWORDS

nlp1 = spacy.load('en_core_web_sm', disable=['parser', 'ner'])



In [142]:

# Location of the training CSV. When the TRAIN_SCC_CSV environment variable is
# set it overrides the default below, so the notebook can run on a machine
# with a different directory layout without editing this cell.
training_dataset = os.environ.get(
    "TRAIN_SCC_CSV",
    "/Users/manis/Dropbox/research/2021/lrev/notebooks/train_scc.csv",
)
# Raw training frame; later cells read and overwrite its 'tweet_txt' column.
training_data = pd.read_csv(training_dataset)




In [135]:

print("starting setup")
# Shared ekphrasis pre-processor; ekphrasis_word() calls its pre_process_doc().
# NOTE(review): the option meanings are defined by the ekphrasis library, not
# by this notebook — confirm against the installed version's documentation.
ekphrasis_process = TextPreProcessor(
    # Categories handed to `normalize`. Each category is listed exactly once
    # (the previous list contained 'url' twice).
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # Categories handed to `annotate`.
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,
    # Both the segmenter and the corrector use the "twitter" statistics.
    segmenter="twitter",
    corrector="twitter",
    unpack_hashtags=True,
    unpack_contractions=True,
    spell_correct_elong=False,
    # Tokenizer is built with lowercase=True.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons]
)

print("finished setup")


def ekphrasis_word(word):
    """Run ``word`` through ``ekphrasis_process`` and return one string.

    The items returned by ``ekphrasis_process.pre_process_doc(word)`` are
    joined with single spaces.
    """
    tokens = ekphrasis_process.pre_process_doc(word)
    return " ".join(tokens)


def get_ekphrasis(word):
    """Pre-process ``word`` with ``ekphrasis_word`` and drop ``<...>`` markers.

    Every substring matching ``<[^>]+>`` is deleted from the processed text,
    then leading and trailing whitespace is trimmed.
    """
    processed = ekphrasis_word(word)
    without_tags = re.sub(r'<[^>]+>', '', processed)
    return without_tags.strip()


def html_prepr(text):
    """Replace every ``<...>`` span in ``str(text)`` with a single space.

    ``text`` is converted with ``str()`` first, so non-string values are
    accepted. The match is non-greedy and does not cross newlines.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub(' ', str(text))


def alphanum_cl(text):
    """Remove or blank out selected punctuation in ``text``.

    Two passes are applied, then the result is stripped and every remaining
    newline becomes a space:

    1. ``? ! ' " #`` and ``|`` are deleted. Inside a character class ``|`` is
       a literal pipe, not an alternation.
    2. ``. , ) ( /`` (and ``|``) are replaced by a space. ``\\|`` in that class
       is an escaped pipe, so a backslash in ``text`` is left untouched.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', text)
    spaced = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
    # Trim first, then flatten interior newlines (same order as before).
    return spaced.strip().replace("\n", " ")


def alphanum_pr(text):
    """Keep only ASCII letters in each whitespace-separated word of ``text``.

    Within every word, each run of characters other than ``a-z``, ``A-Z`` or
    a space is replaced by one space. The cleaned words are joined with
    single spaces and the result is stripped.
    """
    cleaned_words = (re.sub('[^a-z A-Z]+', ' ', word) for word in text.split())
    return " ".join(cleaned_words).strip()

def spacy_lemmatizer(text):
    """Return the ``lemma_`` of every token ``nlp1`` yields for ``text``, space-joined."""
    lemmas = [token.lemma_ for token in nlp1(text)]
    return " ".join(lemmas)

# Text clean-up applied to the 'tweet_txt' column, in this order: lower-case,
# strip <...> spans, punctuation clean-up, letters-only filter, ekphrasis
# processing, lemmatisation. The column is overwritten with the result.
# NOTE(review): alphanum_pr has already replaced every non-letter character
# with a space by the time get_ekphrasis runs, so ekphrasis patterns that rely
# on punctuation or digits probably cannot match — confirm the intended order.
training_data['tweet_txt'] = (
    training_data['tweet_txt']
    .str.lower()
    .apply(html_prepr)
    .apply(alphanum_cl)
    .apply(alphanum_pr)
    .apply(get_ekphrasis)
    .apply(spacy_lemmatizer)
)


starting setup
Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
finished setup


In [136]:
# NLTK's English stop-word list, extended with a few extra filler words.
get_stopwords = set(stopwords.words('english'))
get_stopwords.update(['zero','one','two','three','four','five','six','seven','eight','nine','ten','may','also','across','among','beside','however','yet','within'])
# Case-insensitive pattern: a whole stop word followed by one non-word
# character OR the end of the string. The earlier pattern demanded a trailing
# non-word character, so a stop word that ended the text was never removed.
# Words are escaped (so none can be read as regex syntax) and sorted (so the
# compiled pattern does not depend on set iteration order).
re_get_stopwords = re.compile(
    r"\b(" + "|".join(sorted(re.escape(word) for word in get_stopwords)) + r")(?:\W|$)",
    re.I,
)
def delsw(sentence):
    """Return ``sentence`` with every ``re_get_stopwords`` match replaced by one space."""
    # Only reads the module-level pattern, so no ``global`` declaration is needed.
    stopword_pattern = re_get_stopwords
    return stopword_pattern.sub(" ", sentence)

# Strip stop words from every tweet. delsw is the remover defined in this
# cell; the call previously named removeStopWords, which is not defined
# anywhere in the notebook and raised NameError.
training_data['tweet_txt'] = training_data['tweet_txt'].apply(delsw)


In [137]:
# English Snowball stemmer shared by stemming().
ps = SnowballStemmer("english")


def stemming(sentence):
    """Stem each whitespace-separated word of ``sentence`` with ``ps``.

    The stems are joined with single spaces and the result is stripped.
    """
    stems = (ps.stem(word) for word in sentence.split())
    return " ".join(stems).strip()

# training_data['tweet_txt'] = training_data['tweet_txt'].apply(stemming)


In [140]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
train, test = train_test_split(training_data, random_state=42, test_size=0.20, shuffle=True)
traintext = train['tweet_txt']
testtext = test['tweet_txt']

# Word-level TF-IDF over 1- to 3-grams with L2-normalised rows.
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
# Learn the vocabulary and IDF weights from the TRAINING text only. The
# earlier code called fit() a second time on the test text, which replaced
# the training fit entirely: features were then built from the test
# vocabulary, leaking test information into training.
x_train = vectorizer.fit_transform(traintext)
# Targets are every column except the text.
# NOTE(review): assumes all remaining columns are label columns — confirm.
y_train = train.drop(labels = ['tweet_txt'], axis=1)
# The test split is only transformed with the training-fitted vectorizer.
x_test = vectorizer.transform(testtext)
y_test = test.drop(labels = ['tweet_txt'], axis=1)


In [141]:

# Multi-label model: a ClassifierChain wrapping a default LogisticRegression.
# NOTE(review): presumably each column of y_train is one binary label — confirm.
classifier = ClassifierChain(LogisticRegression())
# Train on the TF-IDF features and label columns of the training split.
classifier.fit(x_train, y_train)
# Predictions for the held-out split, kept for later evaluation against y_test.
classifier_predict = classifier.predict(x_test)
