In [1]:
pip install emot #library for manipulating  emojis and emoticons



In [2]:
import nltk
nltk.download('stopwords')
import pandas as pd
import emot
from emot.emo_unicode import UNICODE_EMO, EMOTICONS
import re
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Lookup table used to expand apostrophe contractions into their long form,
# e.g. "don't" -> "do not". Every key is lower-case.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
}

In [4]:
def clean_text(text):
    """Normalise one raw post into a lower-cased, space-separated token string.

    Steps, in order:
      1. lower-case the text and collapse whitespace runs to single spaces;
      2. expand whitespace-delimited tokens that are keys of ``contractions``;
      3. drop URLs, HTML remnants (``<a href``, ``&amp;``, ``<br />``),
         punctuation and apostrophes;
      4. remove NLTK English stop words, keeping ``"not"``.

    Parameters
    ----------
    text : str
        Raw text of a single post.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none survive).
    """
    # Convert to lower case before the lookup: the contraction keys are lower-case.
    words = text.lower().split()

    # Replace contractions with their longer forms. Only exact token matches
    # are expanded, so a contraction glued to punctuation ("don't,") is not.
    text = " ".join(contractions.get(word, word) for word in words)

    # Remove each URL up to the next whitespace only. Newlines were already
    # collapsed by the split/join above, so a pattern that runs to the end of
    # the line would delete everything following the first URL.
    text = re.sub(r'https?://\S*', '', text)

    # HTML remnants are handled before the punctuation pass, because that pass
    # replaces "/" and would otherwise break the "<br />" pattern.
    text = re.sub(r'<a href', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'<br />', ' ', text)

    # Replace unwanted punctuation and any remaining apostrophes with spaces.
    text = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Remove stop words; "not" is excluded from the stop list so that
    # negations stay in the text.
    stops = _english_stop_words()
    return " ".join(word for word in text.split() if word not in stops)


# Cache for _english_stop_words(); filled on first use.
_stop_words_cache = None


def _english_stop_words():
    """Return the NLTK English stop-word set without "not", built only once."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = frozenset(stopwords.words("english")) - {"not"}
    return _stop_words_cache

In [5]:
# Code to convert emojis and emoticons to text describing their meaning.

# Converting emojis to words
def convert_emojis(text):
    """Replace every emoji listed in ``UNICODE_EMO`` with its textual name.

    The name is the table value with commas and colons removed and the
    remaining words joined by underscores.

    Parameters
    ----------
    text : str
        Text that may contain emoji characters.

    Returns
    -------
    str
        The text with all known emojis replaced; unchanged if none occur.
    """
    for emoji_char, description in UNICODE_EMO.items():
        # Skip building the replacement for emojis that are not present.
        if emoji_char in text:
            name = "_".join(description.replace(",", "").replace(":", "").split())
            text = text.replace(emoji_char, name)
    # Return only after the whole table has been scanned; returning inside the
    # loop would stop after the first table entry.
    return text

        
# Converting emoticons to words    
def convert_emoticons(text):
    """Replace every emoticon listed in ``EMOTICONS`` with its textual name.

    Each key of ``EMOTICONS`` is used as a regular-expression pattern. The
    name is the table value with commas removed and the remaining words
    joined by underscores.

    Parameters
    ----------
    text : str
        Text that may contain emoticons.

    Returns
    -------
    str
        The text with all matching emoticons replaced; unchanged if none match.
    """
    for pattern, description in EMOTICONS.items():
        name = "_".join(description.replace(",", "").split())
        # A function replacement inserts the name literally, so it is never
        # interpreted as a regex replacement template.
        text = re.sub(u'(' + pattern + ')', lambda _match: name, text)
    # Return only after the whole table has been scanned; returning inside the
    # loop would stop after the first table entry.
    return text

In [6]:
# Load the training split; the file is tab-separated.
df_train = pd.read_csv(
    "/content/drive/My Drive/minor/english_dataset/english_dataset.tsv",
    sep='\t',
)

In [7]:
# Preview the first five rows of the training split as loaded.
df_train.head(n=5)

Unnamed: 0,text_id,text,task_1,task_2,task_3
0,hasoc_en_1,#DhoniKeepsTheGlove | WATCH: Sports Minister K...,NOT,NONE,NONE
1,hasoc_en_2,@politico No. We should remember very clearly ...,HOF,HATE,TIN
2,hasoc_en_3,@cricketworldcup Guess who would be the winner...,NOT,NONE,NONE
3,hasoc_en_4,Corbyn is too politically intellectual for #Bo...,NOT,NONE,NONE
4,hasoc_en_5,All the best to #TeamIndia for another swimmin...,NOT,NONE,NONE


In [8]:
# Load the test split; the file is tab-separated.
df_test = pd.read_csv(
    "/content/drive/My Drive/minor/english_dataset/hasoc2019_en_test-2919.tsv",
    sep='\t',
)

In [9]:
# Preview the first five rows of the test split as loaded.
df_test.head(n=5)

Unnamed: 0,text_id,text,task_1,task_2,task_3
0,hasoc_en_902,West Bengal Doctor Crisis: Protesting doctors ...,NOT,NONE,NONE
1,hasoc_en_416,68.5 million people have been forced to leave ...,NOT,NONE,NONE
2,hasoc_en_207,"You came, you saw .... we will look after the ...",NOT,NONE,NONE
3,hasoc_en_595,We'll get Brexit delivered by October 31st. ...,NOT,NONE,NONE
4,hasoc_en_568,Fuck you. Go back to the dark ages you cow @IB...,HOF,PRFN,UNT


In [10]:
# Run the three normalisation passes over the 'text' column of both splits,
# in this order: clean_text, convert_emoticons, convert_emojis. A progress
# message is printed after each pass has been applied to both frames.
# NOTE(review): by the time convert_emoticons runs, clean_text has already
# lower-cased the text and replaced punctuation such as ':', '(' and ')' with
# spaces, so punctuation-based emoticons probably can no longer match --
# confirm whether the emoticon/emoji passes were meant to run first.
for cleaning_step, done_message in (
    (clean_text, "Text cleaning complete."),
    (convert_emoticons, "Emoticons cleaning complete."),
    (convert_emojis, "Emoji cleaning complete."),
):
    df_train['text'] = df_train['text'].apply(cleaning_step)
    df_test['text'] = df_test['text'].apply(cleaning_step)
    print(done_message)

Text cleaning complete.
Emoticons cleaning complete.
Emoji cleaning complete.


In [11]:
# Preview the first five training rows after the cleaning passes.
df_train.head(n=5)

Unnamed: 0,text_id,text,task_1,task_2,task_3
0,hasoc_en_1,dhonikeepstheglove watch sports minister kiren...,NOT,NONE,NONE
1,hasoc_en_2,politico remember clearly individual1 admitted...,HOF,HATE,TIN
2,hasoc_en_3,cricketworldcup guess would winner cwc19 team ...,NOT,NONE,NONE
3,hasoc_en_4,corbyn politically intellectual borisjohnsonsh...,NOT,NONE,NONE
4,hasoc_en_5,best teamindia another swimming competition su...,NOT,NONE,NONE


In [12]:
# Preview the first five test rows after the cleaning passes.
df_test.head(n=5)

Unnamed: 0,text_id,text,task_1,task_2,task_3
0,hasoc_en_902,west bengal doctor crisis protesting doctors a...,NOT,NONE,NONE
1,hasoc_en_416,68 5 million people forced leave homes read,NOT,NONE,NONE
2,hasoc_en_207,came saw look fort good luck,NOT,NONE,NONE
3,hasoc_en_595,get brexit delivered october 31st help build m...,NOT,NONE,NONE
4,hasoc_en_568,fuck go back dark ages cow ibnliverealtime rap...,HOF,PRFN,UNT


In [13]:
# Write both cleaned splits to CSV files; index=False leaves the DataFrame
# index out of the output.
df_train.to_csv(
    "/content/drive/My Drive/minor/english_dataset/english_dataset_prepro.csv",
    index=False,
)
df_test.to_csv(
    "/content/drive/My Drive/minor/english_dataset/hasoc2019_en_test-2919_prepro.csv",
    index=False,
)