In [21]:
#import nltk
#import re
import pandas as pd
import numpy as np
#from sklearn.feature_extraction import text
#from options import *
from sklearn.preprocessing import StandardScaler

In [22]:
def convert_to_lowercase(tweets):
    """
    Function:
             Lowercases every tweet through the pandas ``.str`` accessor.
    Input:
            tweets: object exposing ``.str.lower()`` (e.g. a pandas Series of strings)
    Output:
            the result of ``tweets.str.lower()``, i.e. the same tweets in lowercase
    """
    lowered_tweets = tweets.str.lower()
    return lowered_tweets

In [23]:
def filter_some_punctuation(tweets):
    """
    Function:
            Deletes every occurrence of the characters '.', '?' and '@'.
    Input:
            tweets: pandas Series of strings
    Output:
            new Series with those characters removed (the input Series is left untouched)
    """

    useless_punct = ['.', '?', '@']

    for punct in useless_punct:
        # Series.str.replace returns a NEW Series; without re-assigning the
        # result the function would hand back its input unchanged.
        # regex=False treats '.' and '?' as literal characters rather than
        # regular-expression metacharacters. No case flag is needed because
        # punctuation has no upper/lower case.
        tweets = tweets.str.replace(punct, '', regex=False)

    return tweets

In [24]:
def filter_money(tweets):
    """
    Function:
            Replaces money symbols ('$', 'CHF', '€') by the tag '<money>'.
            'CHF' is matched case-insensitively, so 'chf' is tagged as well.
    Input:
            tweets: pandas Series of strings
    Output:
            new Series in which every money symbol is replaced by '<money>'
            (the input Series is left untouched)
    """

    # '$' is the regex end-of-line anchor, so it has to be escaped to be
    # matched as a literal dollar sign.
    money_symbols = [r'\$', 'CHF', '€']

    # A single alternation pattern handles all symbols in one pass.
    money_pattern = '|'.join(money_symbols)

    # Series.str.replace returns a NEW Series; the result must be returned
    # (discarding it would make this function a no-op).
    return tweets.str.replace(money_pattern, '<money>', case=False, regex=True)

In [25]:
def filter_user(tweets):
    """
    Function:
            Strips the placeholder '<user>' out of every tweet
            (matched case-insensitively).
    Input:
            tweets: object exposing ``.str.replace`` (e.g. a pandas Series of strings)
    Output:
            the tweets with every '<user>' occurrence replaced by an empty string
    """
    user_tag = '<user>'
    without_user = tweets.str.replace(user_tag, '', case=False)
    return without_user


In [26]:
def filter_url(tweets):
    """
    Function:
            Strips the placeholder '<url>' out of every tweet
            (matched case-insensitively).
    Input:
            tweets: object exposing ``.str.replace`` (e.g. a pandas Series of strings)
    Output:
            the tweets with every '<url>' occurrence replaced by an empty string
    """
    url_tag = '<url>'
    without_url = tweets.str.replace(url_tag, '', case=False)
    return without_url

In [27]:
def expand_not(tweets):
    """
    Function:
            Replaces contractions of words by their formal form. In other terms, it expands the contractions.
            For e.g, "i'm" will be expanded to "i am". Matching is case-insensitive and the
            replacement text is always lowercase.
    Input:
            tweets: pandas Series of strings
    Output:
            new Series with contractions expanded
    """
    # The rules are applied top to bottom, so the order is significant:
    #   * irregular negations come before the generic "n't" rule, otherwise
    #     "can't" / "won't" / "shan't" would become "ca not" / "wo not" / "sha not";
    #   * "'ll" comes before the shorter "'l";
    #   * every specific "...'s" rule comes before the catch-all "'s", which
    #     simply drops the possessive / remaining "'s".
    contractions = [
        ("won't", 'will not'),
        ("can't", 'can not'),
        ("shan't", 'shall not'),
        ("n't", ' not'),
        ("i'm", 'i am'),
        ("'re", ' are'),
        ("it's", 'it is'),
        ("that's", 'that is'),
        ("'ll", ' will'),
        ("'l", ' will'),
        ("'ve", ' have'),
        ("'d", ' would'),
        ("he's", 'he is'),       # also turns "she's" into "she is"
        ("what's", 'what is'),
        ("who's", 'who is'),
        ("'s", ''),
    ]

    for contraction, expansion in contractions:
        tweets = tweets.str.replace(contraction, expansion, case=False)

    return tweets

In [28]:
def _build_emoticon_tags():
    """Builds the lookup table {emoticon token: tag} used by emoji_transformation."""
    eyes = ["8", ":", "=", ";"]
    # "" stands for "no nose", so both ":)" and ":-)" are generated.
    noses = ["", "'", "`", "-", "\\"]

    # (tag, mouths when the eyes come first, mouths when the emoticon is mirrored).
    # Mirrored emoticons swap the bracket direction: "(-:" is a smile, ")-:" is sad.
    # All characters are plain literals: tokens are compared for equality, not
    # matched as regular expressions, so nothing may be regex-escaped here.
    families = [
        ("<smile>", [")", "d", "]", "}"], ["(", "[", "{"]),
        ("<funnyface>", ["p"], []),
        ("<neutralface>", ["|", "/", "\\"], ["|", "/", "\\"]),
        ("<sadface>", ["(", "[", "{"], [")", "]", "}"]),
    ]

    tags = {"<3": "<heart>", "♥": "<heart>"}
    for tag, forward_mouths, mirrored_mouths in families:
        for eye in eyes:
            for nose in noses:
                for mouth in forward_mouths:
                    # setdefault keeps the first tag if a token were ever generated twice
                    tags.setdefault(eye + nose + mouth, tag)
                for mouth in mirrored_mouths:
                    tags.setdefault(mouth + nose + eye, tag)
    return tags


# Built once when the cell is run instead of on every call.
_EMOTICON_TAGS = _build_emoticon_tags()


def emoji_transformation(tweet):
    """
    Function:
            Replaces emoticons/smileys by tags. For e.g, <3 will be replaced by <heart>,
            :) by <smile>, :( by <sadface>, :/ by <neutralface> and :p by <funnyface>.
            Only whitespace-separated tokens that are exactly an emoticon are replaced.
            Matching is case-sensitive: the letter mouths are lowercase 'd' and 'p'.
    Input:
            tweet as string
    Output:
            transformed tweet as string; tokens are re-joined with single spaces
    """
    return " ".join(_EMOTICON_TAGS.get(word, word) for word in tweet.split())

In [29]:
def preprocess(data_X):
    """
    Function:
            Standardizes data_X with scikit-learn's StandardScaler.
            data_X is copied and converted to a numpy array first. When that array has more
            than one entry along its first axis, a fresh StandardScaler is fitted on each
            entry separately and the results are stacked into a new array; otherwise a
            single StandardScaler is fitted on the whole array.
    Input:
            data_X: object with a ``.copy()`` method that ``np.array`` can convert
    Output:
            numpy array holding the standardized values
    """
    # NOTE(review): in the multi-entry branch each entry is passed to fit_transform on its
    # own, so every entry presumably has to be 2-D itself -- confirm against the caller.
    samples = np.array(data_X.copy())

    if samples.shape[0] <= 1:
        return StandardScaler().fit_transform(samples)

    return np.array([StandardScaler().fit_transform(entry) for entry in samples])