In [0]:
# MODULES IMPORT 

import os
import pdb
import re
import sys

import numpy as np
import pandas as pd

In [7]:
# GOOGLE COLAB SETUP

# Load the Drive helper and mount Google Drive, but only where the Colab
# package is actually available. Checking `os.name == 'posix'` is not enough:
# it is also true on a local Linux/macOS machine, where `google.colab` cannot
# be imported.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    # This will prompt for authorization.
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# VARIABLES AND PATHS
# Use the Google Drive layout only when running inside Google Colab; any other
# machine (Windows, Linux or macOS) uses the paths relative to this notebook.
# `os.name == 'posix'` cannot tell Colab apart from a local Linux/macOS kernel.
if 'google.colab' in sys.modules:
    raw_data_path = 'drive/My Drive/Colab Notebooks/adaptHAN/AOBDL_code/raw_data'
    data_path     = 'drive/My Drive/Colab Notebooks/adaptHAN/AOBDL_code/data'
    codes_path    = 'drive/My Drive/Colab Notebooks/adaptHAN/AOBDL_code/AOBDL_data_preparation'
else:
    raw_data_path = '../raw_data'
    data_path     = '../data'
    codes_path    = '../AOBDL_data_preparation'


# 3. Read the labelled tweets into a pandas DataFrame; the first CSV column
#    is used as the index.
df = pd.read_csv(f'{raw_data_path}/labeled_data_twitter_davidson.csv', index_col=0)

In [0]:
# Create the target directory if it doesn't exist yet. `os.makedirs` also
# creates any missing parent directories (`os.mkdir` would raise
# FileNotFoundError), and `exist_ok=True` tolerates the directory appearing
# between the check and the call.
if not os.path.isdir(data_path):
    os.makedirs(data_path, exist_ok=True)
    print("Directory " , data_path ,  " Created ")

In [0]:
# Binary target: 'mal' is 1 for rows whose 'class' is 0 or 1, and 0 otherwise.
df['mal'] = df['class'].isin([0, 1]).astype('int64')

In [0]:
# Clean data completely 
def substitute_repeats_fixed_len(text, nchars, ntimes=3):
    """Collapse repeated chunks of ``nchars`` non-whitespace characters.

    Wherever a chunk of exactly ``nchars`` non-whitespace characters is
    immediately followed by at least ``ntimes - 1`` further copies of itself,
    the whole run is replaced by a single copy of the chunk.

    Relies on the module-level ``import re`` in the imports cell.

    Args:
        text: String to process.
        nchars: Length of the repeated chunk, in characters.
        ntimes: Minimum total number of consecutive copies that triggers the
            collapse.

    Returns:
        The text with every qualifying run reduced to one copy of its chunk.
    """
    # Group 1 captures the chunk; the back-reference must then match at least
    # ``ntimes - 1`` more copies directly after it.
    pattern = rf"(\S{{{nchars}}})(\1{{{ntimes - 1},}})"
    return re.sub(pattern, r"\1", text)
  
def substitute_repeats(text, ntimes=3):
    """Collapse repeated chunks of every length from 1 to 19 characters.

    Calls ``substitute_repeats_fixed_len`` once per chunk length, shortest
    first, feeding the result of each pass into the next.

    Args:
        text: String to process.
        ntimes: Forwarded unchanged to ``substitute_repeats_fixed_len``.

    Returns:
        The text after all passes have been applied.
    """
    collapsed = text
    chunk_len = 1
    while chunk_len < 20:
        collapsed = substitute_repeats_fixed_len(collapsed, chunk_len, ntimes)
        chunk_len += 1
    return collapsed

# Words dropped from the output when ``remove_stop_words`` is true.
_STOP_WORDS = frozenset([
    "a", "the", "an", "are", "as", "did",
    "do", "is", "has", "have", "had", "was", "were",
    "will", "would", "am", "it", "for", "on", "of",
])

# Negative contractions, expanded with ``re.sub`` in this order.
_NEG_CONTRACTIONS = (
    (r"aren't", "are not"),
    (r"can't", "can not"),
    (r"couldn't", "could not"),
    (r"daren't", "dare not"),
    (r"didn't", "did not"),
    (r"doesn't", "does not"),
    (r"don't", "do not"),
    (r"isn't", "is not"),
    (r"hasn't", "has not"),
    (r"haven't", "have not"),
    (r"hadn't", "had not"),
    (r"mayn't", "may not"),
    (r"mightn't", "might not"),
    (r"mustn't", "must not"),
    (r"needn't", "need not"),
    (r"oughtn't", "ought not"),
    (r"shan't", "shall not"),
    (r"shouldn't", "should not"),
    (r"wasn't", "was not"),
    (r"weren't", "were not"),
    (r"won't", "will not"),
    (r"wouldn't", "would not"),
    (r"ain't", "am not"),  # not only, but a stop word anyway
)

# Remaining apostrophe contractions, expanded after the negative ones.
_OTHER_CONTRACTIONS = (
    (r"'ll", " will"),
    (r"'s", " has"),   # or 'is', but both are stop words
    (r"'d", " had"),   # or 'would', but both are stop words
    (r"'ve", " have"),
    (r"'re", " are"),
)

# Emoticons (matched on lower-cased text, HTML entities still escaped) and the
# word each one is replaced with.
_EMOTICON_WORDS = (
    ("&lt;3", " good "),
    (":dd", " good "),
    (":d", " good "),
    (":p", " good "),
    ("8)", " good "),
    (":-)", " good "),
    (":)", " good "),
    (";)", " good "),
    ("(-:", " good "),
    ("(:", " good "),
    (":/", " bad "),
    (":&gt;", " sad "),
    (":')", " sad "),
    (":-(", " bad "),
    (":(", " bad "),
    (":-s", " bad "),
    (":s", " bad "),
)


def _compile_emoticon(emoticon):
    """Compile a literal emoticon into a regex that cannot match inside a word."""
    pattern = re.escape(emoticon)
    # An emoticon that starts or ends with a letter/digit (":d", "8)", "&lt;3")
    # must not be glued to another letter/digit on that side, otherwise ":d"
    # would match the start of ":dont" and "8)" the end of "(1998)".
    if emoticon[0].isalnum():
        pattern = r"(?<![a-z0-9])" + pattern
    if emoticon[-1].isalnum():
        pattern = pattern + r"(?![a-z0-9])"
    return re.compile(pattern)


_EMOTICON_PATTERNS = tuple(
    (_compile_emoticon(emoticon), word) for emoticon, word in _EMOTICON_WORDS
)

# Whole-word slang / spelling normalisations. Word boundaries are required:
# a plain substring replacement of "u" or "r" would corrupt every word that
# contains those letters.
_WORD_REPLACEMENTS = (
    (r"\bya{1,5}y\b!?", " good "),  # yay, yaay, ... yaaaaay, optionally with "!"
    (r"\br\b", "are"),
    (r"\bu\b", "you"),
    (r"\bhaha\b", "ha"),
    (r"\bhahaha\b", "ha"),
    (r"\bcannot\b", "can not"),
    (r"\bi'm\b", "i am"),
    (r"\bits\b", "it is"),
)

# Any run of non-whitespace characters that starts, at a word boundary, with
# "http" or "www".
_URL_PATTERN = re.compile(r"\b(?:http|www)\S*")


def text_to_wordlist(text, remove_stop_words=True, stem_words=False, with_punct_sent=False):
    """Normalise one tweet into a space-separated string of cleaned tokens.

    Processing order:
      1. lower-case the text;
      2. remove URLs (anything starting with ``http`` or ``www``) while the
         punctuation that holds them together is still present;
      3. expand contractions ("can't" -> "can not", "'ll" -> " will", ...);
      4. replace emoticons and a few slang words ("u" -> "you");
      5. replace every character that is not a letter or digit with a space
         (``!``, ``.`` and ``?`` are kept when ``with_punct_sent`` is true);
      6. turn a stand-alone " m " into " am " and delete all digits;
      7. optionally drop stop words and stem the remaining tokens;
      8. drop any token that still starts with ``http`` or ``www``.

    Args:
        text: Raw text. ``None`` and NaN are treated as missing and yield
            ``""``; any other non-string value is converted with ``str``.
        remove_stop_words: Drop the tokens listed in ``_STOP_WORDS``.
        stem_words: Stem every token with ``SnowballStemmer('english')``.
        with_punct_sent: Keep ``!``, ``.`` and ``?`` in the output.

    Returns:
        The cleaned tokens joined by single spaces, with no leading or
        trailing whitespace; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        # NaN is the only value that is not equal to itself.
        if text is None or text != text:
            return ""
        text = str(text)

    text = text.lower()

    # URLs must go before punctuation is stripped; afterwards
    # "http://t.co/abc" would already be split into "http", "t", "co", "abc".
    text = _URL_PATTERN.sub(" ", text)

    for pattern, replacement in _NEG_CONTRACTIONS + _OTHER_CONTRACTIONS:
        text = re.sub(pattern, replacement, text)
    for emoticon_re, word in _EMOTICON_PATTERNS:
        text = emoticon_re.sub(word, text)
    for pattern, replacement in _WORD_REPLACEMENTS:
        text = re.sub(pattern, replacement, text)

    # Strip everything except letters and digits (and, optionally, the
    # sentence punctuation "!", "." and "?").
    if with_punct_sent:
        text = re.sub(r"[^A-Za-z0-9!.?]", " ", text)
    else:
        text = re.sub(r"[^A-Za-z0-9]", " ", text)

    # A lone "m" is what remains of "'m" once the apostrophe has been stripped.
    text = re.sub(r" m ", " am ", text)
    text = re.sub('[0-9]+', '', text)

    words = text.split()

    # Optionally, remove stop words
    if remove_stop_words:
        words = [word for word in words if word not in _STOP_WORDS]

    # Optionally, shorten words to their stems
    if stem_words:
        # NOTE(review): SnowballStemmer is not imported anywhere in the visible
        # notebook; presumably `from nltk.stem import SnowballStemmer` is
        # needed before calling with stem_words=True - confirm.
        stemmer = SnowballStemmer('english')
        words = [stemmer.stem(word) for word in words]

    # Safety net for URL-like tokens that only appear after the cleaning above.
    words = [word for word in words if not word.startswith(("http", "www"))]
    return " ".join(words)

In [19]:
# Variant 1: every non-alphanumeric character is stripped from the tweets.
print('Preparing the data without any punctuation')
df_no_punct = df.copy()

print('Train')
# Clean each raw tweet from `df` and store the result in the copy.
df_no_punct['tweet'] = df['tweet'].map(
    lambda raw_tweet: text_to_wordlist(raw_tweet, remove_stop_words=True, stem_words=False)
)

Preparing the data without any punctuation
Train


In [0]:
# Write the punctuation-free variant to disk; the DataFrame index is not saved.
df_no_punct.to_csv(
    path_or_buf=f'{data_path}/twitter_cleaned_no_punkt.csv',
    index=False,
)

In [22]:
# Variant 2: same cleaning, but '!', '.' and '?' are kept in the tweets.
print('Preparing the data with sentence punctuation')
df_sent_punct = df.copy()

print('Train')
# Clean each raw tweet from `df` and store the result in the copy.
df_sent_punct['tweet'] = df['tweet'].map(
    lambda raw_tweet: text_to_wordlist(
        raw_tweet, remove_stop_words=True, stem_words=False, with_punct_sent=True
    )
)

Preparing the data with sentence punctuation
Train


In [0]:
# Write the sentence-punctuation variant to disk; the DataFrame index is not saved.
df_sent_punct.to_csv(
    path_or_buf=f'{data_path}/twitter_cleaned_sent_punkt.csv',
    index=False,
)