## Loading Data from data.zip file

In [1]:
from data import c3d

# load_data() returns the dataset already split into train, validation and test sets.
train, validation, test = c3d.load_data()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5384/5384 [00:03<00:00, 1559.01it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6493/6493 [00:04<00:00, 1555.82it/s]


In [2]:
# Pull the raw tweets and their labels out of the training split.
train_tweets = train.features
train_labels = train.labels

In [3]:
# Report how many examples ended up in each of the three splits.
print(f"{train.num_examples} training samples")
print(f"{validation.num_examples} validation samples")
print(f"{test.num_examples} testing samples")

6680 training samples
2227 validation samples
2970 testing samples


## Data Preprocessing

In [5]:
from data import c3d
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from ekphrasis.classes.tokenizer import SocialTokenizer

from nltk.tokenize import TweetTokenizer
from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.classes.segmenter import Segmenter
import re
from ekphrasis.classes.preprocessor import TextPreProcessor
import emoji
from tqdm import trange 
import collections, numpy

In [6]:
# Unpack each split into its tweets (features) and its labels.
train_tweets = train.features
train_labels = train.labels

val_tweets = validation.features
val_labels = validation.labels

test_tweets = test.features
test_labels = test.labels

In [7]:
# Label distribution of each split (train, validation, test).
# NOTE(review): the recorded output shows a single label (0) in every split, i.e. a
# one-class distribution rather than a balanced one - verify the labels that were loaded.
collections.Counter(train_labels),collections.Counter(val_labels),collections.Counter(test_labels)

(Counter({0: 6680}), Counter({0: 2227}), Counter({0: 2970}))

### Tokenization

In [9]:
# Use the first training tweet as the running example.
sentence = train_tweets[0]
print(sentence)
# NLTK's generic word tokenizer; in the recorded output it splits '@' from the user handle.
words = word_tokenize(sentence)
print(words)

@FahadHarthi1 Nothing like a coffee in the winter.
['@', 'FahadHarthi1', 'Nothing', 'like', 'a', 'coffee', 'in', 'the', 'winter', '.']


### Converting to lower case

In [10]:
# Rebinds `sentence` to its lowercase form; the original-cased text is still train_tweets[0].
sentence = sentence.lower()
print(sentence)

@fahadharthi1 nothing like a coffee in the winter.


### Loading stop words

In [12]:
# NLTK's English stop-word list, held as a set.
stop_words = set(stopwords.words('english'))
# Prints the whole set; the output is long.
print(stop_words)

{'himself', 'all', 'theirs', "haven't", 'from', 'a', 'shan', "isn't", 'such', 'ours', 'y', 'now', "don't", 'above', 'through', 'or', 're', "you're", 'ma', 'being', 'there', 's', 'having', 'hasn', "needn't", 'up', 'why', 'only', 'those', 'at', 'weren', 'can', 'themselves', 'were', 'shouldn', 've', "couldn't", 'your', 'ain', 'than', 'between', "wouldn't", 'i', "shouldn't", 'is', 'am', 'wouldn', 'not', 'll', "shan't", 'she', 'with', 'if', 'doing', 'aren', 'itself', 'what', 'him', 'until', 'mightn', "doesn't", "weren't", 'how', 'while', 'are', 'once', 'that', 'out', 'some', 'its', "she's", 'do', 'no', 'own', 'o', 'after', 'yourselves', 'below', 'during', 'the', 'won', 'but', 'he', 'by', "mightn't", 'ourselves', 'any', 'our', 'whom', 'has', 'couldn', 'didn', 'had', 'down', 'into', 'herself', 'was', 'under', 'hers', 'then', 'myself', 'her', 'in', 'more', 'these', 'they', 'so', 'on', 'will', 'did', "it's", 'hadn', 'needn', 'wasn', 'it', 'does', 'doesn', 'further', 'again', "that'll", "didn't"

In [13]:
# Reset the example to the original-cased tweet and compare two tweet-aware tokenizers.
sentence = train_tweets[0]
tknzr = TweetTokenizer()
# Bound `tokenize` method of an ekphrasis SocialTokenizer configured with lowercase=True.
social_tokenizer = SocialTokenizer(lowercase=True).tokenize

# In the recorded output both tokenizers keep '@FahadHarthi1' as one token;
# only the social tokenizer's tokens are lowercased.
print("Tweet Tokenizer:",tknzr.tokenize(sentence))
print()
print("Social tokenizer:",social_tokenizer(sentence))

Tweet Tokenizer: ['@FahadHarthi1', 'Nothing', 'like', 'a', 'coffee', 'in', 'the', 'winter', '.']

Social tokenizer: ['@fahadharthi1', 'nothing', 'like', 'a', 'coffee', 'in', 'the', 'winter', '.']


  self.tok = re.compile(r"({})".format("|".join(pipeline)))


### Stemming

In [14]:
# Stem each whitespace-separated token with the Porter stemmer.
# Splitting on whitespace leaves punctuation attached (the recorded output ends in 'winter.').
ps = PorterStemmer()
for word in sentence.split():
    print(ps.stem(word))

@fahadharthi1
noth
like
a
coffe
in
the
winter.


### Lemmatization

In [16]:
# Lemmatize each whitespace-separated token, passing pos='v' (verb) for every token.
# In the recorded output no token of this example sentence is changed.
lemmatizer = WordNetLemmatizer()
for word in sentence.split():
    print(lemmatizer.lemmatize(word, pos='v'))

@FahadHarthi1
Nothing
like
a
coffee
in
the
winter.


In [19]:
# Replace every "scheme://..." URL in one sample tweet with a space, then collapse
# the remaining whitespace by splitting and re-joining on single spaces.
# The pattern is a raw string so that \w, \/ and \S reach the regex engine
# without triggering Python's invalid-escape-sequence warning.
tweet = ' '.join(re.sub(r"(\w+:\/\/\S+)", " ", train_tweets[9]).split())
tweet

'@Dani_TwiHard I told everyone, I have been diagnosed with "Atypical Depression". You can google it, that explains most dumb stuff I say =('

### Loading spell checker

In [20]:
# Instantiate ekphrasis' SpellCorrector with corpus="english"; per the recorded output,
# the first run downloads and caches the word statistics.
# NOTE(review): `sp` is not referenced by any later cell visible here - confirm it is needed.
sp = SpellCorrector(corpus="english")

Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading english - 1grams ...
generating cache file for faster loading...
reading ngrams C:\Users\nithy\.ekphrasis\stats\english\counts_1grams.txt


## Data Cleaning

In [21]:
# Low-information words (pronouns, prepositions, chat shorthand such as 'u', 'ur', 'im')
# that clean_tweet strips from every tweet as whole words.
# Every entry must be followed by a comma: two adjacent string literals are silently
# concatenated by Python (that is how 'havehow' and 'urwhile' used to end up in this
# list while 'ur' and 'while' were missing). Exact duplicates have been dropped.
words_to_delete = [
    'theirs', 'she', 'of',
    'all', 'her', 'ourselves', 'that', 'some', 'your',
    'what', 'or', 'me', 'now', 'after',
    'until', 'them', 'through', 'who', 'herself', 'he',
    'y', 'each', 'under', 'hers', 'other', 'down',
    'this', 'their', 'as', 'on', 'few', 'which', 'further',
    'whom', 'its', 'so', 'yourselves', 'because', 'it', 'both', 'in', 'nor',
    'yours', 'yourself', 'before', 'since',
    'there', 'himself', 'then',
    'him', 'over', 'here', 'an', 'into', 'next', 'd', 'u', 'r', 'im', 'm', 'have',
    'the', 'again', 'such', 'myself', 'they',
    'we', 'those', 'between', 'once', 'even',
    'how', 'from', 'ours', 'during', 'be', 'ama', 'i', 'do', 'but',
    'his', 'against', 'below', 'to', 'about',
    'by', 'where', 'a', 'very', 'our', 'my', 'for', 'and', 'ur',
    'while', 'only', 'up', 'these', 'just', 'same',
    'you', 'themselves', 'above', 'with', 'than',
    'own', 'out', 'when', 'any', 'too', 'o', 'at',
]
def load_dict_contractions():
    """Build the contraction / slang expansion table.

    Returns:
        A new dict on every call, mapping a contraction or informal spelling
        (e.g. "can't", "gonna", "luv") to its expanded form. Keys are stored
        exactly as written here; a few of them contain capital letters
        ("I'd", "I'm", "Whatcha", ...).
    """
    expansions = {
        "ain't": "is not",
        "amn't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "'cause": "because",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "could've": "could have",
        "daren't": "dare not",
        "daresn't": "dare not",
        "dasn't": "dare not",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "e'er": "ever",
        "em": "them",
        "everyone's": "everyone is",
        "finna": "fixing to",
        "gimme": "give me",
        "gonna": "going to",
        "gon't": "go not",
        "gotta": "got to",
        "hadn't": "had not",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'll": "he will",
        "he's": "he is",
        "he've": "he have",
        "how'd": "how would",
        "how'll": "how will",
        "how're": "how are",
        "how's": "how is",
        "I'd": "I would",
        "I'll": "I will",
        "I'm": "I am",
        "I'm'a": "I am about to",
        "I'm'o": "I am going to",
        "isn't": "is not",
        "it'd": "it would",
        "it'll": "it will",
        "it's": "it is",
        "I've": "I have",
        "kinda": "kind of",
        "let's": "let us",
        "mayn't": "may not",
        "may've": "may have",
        "mightn't": "might not",
        "might've": "might have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "must've": "must have",
        "needn't": "need not",
        "ne'er": "never",
        "o'": "of",
        "o'er": "over",
        "ol'": "old",
        "oughtn't": "ought not",
        "shalln't": "shall not",
        "shan't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "should've": "should have",
        "somebody's": "somebody is",
        "someone's": "someone is",
        "something's": "something is",
        "that'd": "that would",
        "that'll": "that will",
        "that're": "that are",
        "that's": "that is",
        "there'd": "there would",
        "there'll": "there will",
        "there're": "there are",
        "there's": "there is",
        "these're": "these are",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "this's": "this is",
        "those're": "those are",
        "'tis": "it is",
        "'twas": "it was",
        "wanna": "want to",
        "wasn't": "was not",
        "we'd": "we would",
        "we'd've": "we would have",
        "we'll": "we will",
        "we're": "we are",
        "weren't": "were not",
        "we've": "we have",
        "what'd": "what did",
        "what'll": "what will",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "where'd": "where did",
        "where're": "where are",
        "where's": "where is",
        "where've": "where have",
        "which's": "which is",
        "who'd": "who would",
        "who'd've": "who would have",
        "who'll": "who will",
        "who're": "who are",
        "who's": "who is",
        "who've": "who have",
        "why'd": "why did",
        "why're": "why are",
        "why's": "why is",
        "won't": "will not",
        "wouldn't": "would not",
        "would've": "would have",
        "y'all": "you all",
        "you'd": "you would",
        "you'll": "you will",
        "you're": "you are",
        "you've": "you have",
        "Whatcha": "What are you",
        "luv": "love",
        "sux": "sucks",
        "shes": "she is",
    }
    return expansions

In [22]:
# ekphrasis pre-processing pipeline; clean_tweet uses it as its default processor.
text_processor = TextPreProcessor(
    # terms that will be normalized (each entry listed once; 'url' used to appear twice)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
               'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated",
              'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    # corpus whose word statistics are used for word segmentation
    segmenter="twitter",
    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="english",
    unpack_hashtags=False,  # word segmentation on hashtags is switched off
    unpack_contractions=True,
    spell_correction=True,
    # tokens keep their case at this stage (lowercase=False)
    tokenizer=SocialTokenizer(lowercase=False).tokenize
)

Reading english - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [23]:
def clean_tweet(tweet, processor=text_processor):
    """Normalise one raw tweet into a cleaned, lemmatised string.

    Steps, in order: emojis to text aliases, lowercasing, curly-apostrophe
    normalisation, contraction expansion, ``processor.pre_process_doc``,
    punctuation removal, verb lemmatisation, removal of ``words_to_delete``
    and whitespace collapsing.

    Args:
        tweet: raw tweet text.
        processor: object exposing ``pre_process_doc(str)`` that returns an
            iterable of string tokens. Defaults to the module-level
            ``text_processor``.

    Returns:
        The cleaned tweet as one string. Every lemma is prefixed with a
        space, so the result starts with a space whenever the tweet yields
        at least one token.
    """
    tweet = emoji.demojize(tweet)
    tweet = tweet.lower().replace("’", "'")

    # The text is already lowercase here, so the lookup table is lowercased
    # as well; otherwise entries such as "I'm" or "Whatcha" could never match
    # and their expansions would reintroduce capital letters.
    contractions = {
        short.lower(): expanded.lower()
        for short, expanded in load_dict_contractions().items()
    }
    tweet = " ".join(contractions.get(word, word) for word in tweet.split())

    # Honour the caller-supplied processor instead of always using the global one.
    tweet = " ".join(processor.pre_process_doc(tweet))

    # remove punctuation ('_' is not in the set, so emoji aliases such as
    # smiling_face stay in one piece)
    tweet = re.sub(u'[{}]'.format('!"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~'), u'', tweet)

    # Lemmatizer: every token is lemmatised as a verb.
    lemmatizer = WordNetLemmatizer()
    s = "".join(" " + lemmatizer.lemmatize(word, pos='v') for word in tweet.split())

    # unuseful words removal: one whole-word alternation instead of one regex
    # pass per word; entries are escaped so they are always matched literally.
    if words_to_delete:
        pattern = r'\b(?:' + '|'.join(re.escape(w) for w in words_to_delete) + r')\b'
        s = re.sub(pattern, '', s)

    # correct all multiple white spaces to a single white space
    return re.sub(r'\s+', ' ', s)

In [24]:
def clean_tweets(tweets):
    """Apply ``clean_tweet`` to every tweet while showing a tqdm progress bar.

    ``tweets`` must support ``len()`` and integer indexing. The cleaned
    strings are returned as a list, in the same order as the input.
    """
    return [clean_tweet(tweets[idx]) for idx in trange(len(tweets))]

In [27]:
# Clean the whole training split, then eyeball the first 15 raw/cleaned pairs.
Clean_train_tweets = clean_tweets(train_tweets)

for i in range(15):
    # One call emits: blank line, raw tweet, cleaned tweet, blank line.
    print("", train_tweets[i], Clean_train_tweets[i], "", sep="\n")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6680/6680 [00:06<00:00, 1101.18it/s]


@FahadHarthi1 Nothing like a coffee in the winter.
 user nothing like coffee winter


I am Monty's Sally Cinnamon, isn't that right @DanMonty11 ☺️
 monty s sally cinnamon not right user smiling_face


Thorn Breeder! I just bred a Thorn Dragon in DragonVale! Visit my park to check it out! https://t.co/i3dG1ohjOM
 thorn breeder breed thorn dragon dragonvale visit park check url


I'm diagnosed with depression. I was like, the kush will take care of this sh*t
 diagnose depression like kush will take care sht censor


Seguim a @camiamic un projecte comunitari d'educació en valors cívics i ambientals a l'Esquerra de l'Eixample i Sant Antoni
 seguim user un projecte comunitari educació en valors cívics ambientals l esquerra de l eixample sant antoni


I feel so childish for being up this late like I don't have stuff to do TODAY 🙄
 feel childish late like not stuff today face_with_rolling_eyes


@BeamingBeautyx I have been diagnosed with Depression, Bipolar Disorder, and Borderline Personali


