In [133]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import re
import preprocessor as p
import enchant
from nltk.stem import PorterStemmer, WordNetLemmatizer
from __future__ import unicode_literals
from stemming.porter2 import stem
import ftfy
import itertools
from re import search
import csv
import pandas as pd
from tokenizers import twokenize
from postaggers import arktagger
from nltk import bigrams
from nltk import trigrams
from nltk.tokenize import TweetTokenizer
from clusters import Clusters

# Required:
#pip install nltk
#pip install stemming

## Filter/Cleaning

In [93]:
# Verbose-mode pattern for western-style emoticons such as :-) ;P =[
# (the '#' comments inside the pattern are ignored because of re.VERBOSE).
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\[/\\OpP] # Mouth
    )"""

# Alternatives are tried in order, so the more specific patterns come first.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    # The character class used to contain the HTML-escaped "&amp;" instead of "&".
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# tokens_re matches any single token; emoticon_re matches a string that is
# exactly one emoticon.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    """Split ``s`` into tokens with the module-level ``tokens_re`` pattern."""
    return tokens_re.findall(s)

def lower_tokenize(s, lowercase=False):
    """Tokenize ``s`` and optionally lower-case the tokens.

    The original body called ``tokenize`` without it being defined anywhere in
    the notebook, so the helper is defined alongside this function.

    Args:
        s: text to tokenize.
        lowercase: when true, every token is lower-cased except tokens that
            ``emoticon_re`` recognises as an emoticon (so ``:D`` keeps its case).

    Returns:
        list of token strings.
    """
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

def wordcorrector(tweet):
    """Replace misspelled tokens with the first suggestion from enchant.

    The previous implementation only rebound the loop variable, so the token
    list was returned unchanged; a new list is now built instead.

    Args:
        tweet: list of token strings.

    Returns:
        A new list in which every token rejected by the ``en_US`` dictionary is
        replaced by its first suggestion; tokens with no suggestion are kept.
    """
    d = enchant.Dict("en_US")
    corrected = []
    for word in tweet:
        if d.check(word):
            corrected.append(word)
            continue
        suggestions = d.suggest(word)
        # Keep the original token when the dictionary has nothing to offer.
        corrected.append(suggestions[0] if suggestions else word)
    return corrected

def lowered(tweet):
    """Lower-case every token of ``tweet`` in place and return the same list.

    Tokens are lower-cased as text. The earlier version encoded each token to
    UTF-8 first, which left non-ASCII letters untouched and produced byte
    strings that could not be joined with a unicode separator afterwards.

    Args:
        tweet: mutable list of token strings.

    Returns:
        The same list object, with each element lower-cased.
    """
    for i, token in enumerate(tweet):
        tweet[i] = token.lower()
    return tweet

# Stemming / lemmatization helpers
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()


### Feature Extractor \M/
## Bag of Words
# Word-level bag-of-words vectorizer capped at 5000 features.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)


# Element types that p.clean() strips from a tweet.
p.set_options(
    p.OPT.URL,
    p.OPT.MENTION,
    p.OPT.HASHTAG,
    p.OPT.RESERVED,
    p.OPT.EMOJI,
    p.OPT.SMILEY,
    p.OPT.NUMBER,
)

def filterer(input_data, pclean = 1, fixer = 1, whitespaces = 1, rm_numbers = 1, rm_punc = 1, corrector = 1, stemmer = 1, lemmatizer = 1, lowercase = 1, limit = 40):
    """Clean and normalise the ``"tweet"`` column of ``input_data``.

    Each step is enabled when its flag equals 1. Steps run in this order:
    ``p.clean`` -> ``ftfy.fix_text`` -> whitespace collapsing -> digit removal
    -> punctuation removal -> ``word_tokenize`` -> spell correction ->
    stemming -> lemmatization -> lower-casing.

    Args:
        input_data: DataFrame with a ``"tweet"`` column.
        pclean: strip the element types configured through ``p.set_options``.
        fixer: repair text with ftfy.
        whitespaces: collapse runs of whitespace into a single space.
        rm_numbers: drop digit characters.
        rm_punc: drop a fixed set of punctuation characters.
        corrector: run ``wordcorrector`` on the tokens.
        stemmer: stem tokens with ``stem``. This parameter shadows the
            module-level ``stemmer`` object inside the function.
        lemmatizer: lemmatize tokens as verbs with ``lemmatiser``.
        lowercase: lower-case tokens with ``lowered``.
        limit: number of leading rows to process (``None`` for all rows).
            Defaults to 40, the value that used to be hard-coded.

    Returns:
        list with one UTF-8 encoded, space-joined string per processed row.
    """
    punctuation = frozenset("'\\*;$%&-!.:/()?,\"")
    filtered_tweets = []
    for _, row in input_data[:limit].iterrows():
        # Always start from the raw text so the later steps also work when
        # pclean is switched off (previously the whole row was passed on).
        tweet = row["tweet"]
        if pclean == 1:
            tweet = p.clean(tweet)
        if fixer == 1:
            ## fix words
            tweet = ftfy.fix_text(tweet)
        if whitespaces == 1:
            ## remove whitespaces
            tweet = re.sub(r'\s+', ' ', tweet)
        if rm_numbers == 1:
            ## remove numbers
            tweet = ''.join(ch for ch in tweet if not ch.isdigit())
        if rm_punc == 1:
            ## fix punctuation
            tweet = ''.join(ch for ch in tweet if ch not in punctuation)
        ## tokenize
        tweet = word_tokenize(tweet)
        ## Word corrector
        if corrector == 1:
            tweet = wordcorrector(tweet)
        ## stemmer
        if stemmer == 1:
            tweet = [stem(token) for token in tweet]
        ## lemmatizer
        if lemmatizer == 1:
            tweet = [lemmatiser.lemmatize(token, pos="v") for token in tweet]
        ## lower case
        if lowercase == 1:
            tweet = lowered(tweet)
        filtered_tweets.append(" ".join(tweet).encode('utf-8'))
    # The original returned the undefined name `outputd` (NameError).
    return filtered_tweets



## Morphological Features

In [102]:
# Shared NLTK TweetTokenizer instance used by the feature-extraction cells below.
tknzr = TweetTokenizer()

## Elongated Word
def has_long(sentence):
    """Return True when ``sentence`` contains a letter repeated three or more times in a row."""
    match = re.search("([a-zA-Z])\\1{2,}", sentence)
    return match is not None

def elongated(inputtweets):
    """Map each tweet to 1 if ``has_long`` reports an elongated word in it, else 0."""
    return [1 if has_long(tweet) == True else 0 for tweet in inputtweets]

def numberOfElongatedWords(sentence):
    """Count whitespace-separated words containing a letter repeated three or more times."""
    pattern = re.compile("([a-zA-Z])\\1{2,}")
    total = 0
    for word in sentence.split():
        if pattern.search(word):
            total += 1
    return total

## Existence of time
def hastime(sentence):
    """Return 1 when ``sentence`` mentions a clock time, else 0.

    Recognised forms: ``5pm``, ``5 pm``, ``10:30 AM``, ``10.30 p.m.`` and
    ``at 5 o'clock``.

    The earlier patterns used an unescaped ``.`` between the number and the
    am/pm marker, which demanded exactly one arbitrary character there: "5pm"
    was missed while "12 among" counted as a time. The separator is now an
    optional whitespace character, the dots in "a.m."/"p.m." are literal, and
    the marker must not be followed by another letter.
    """
    clock_time = re.compile(r"\b\d{1,2}(?:[:.]\d{1,2})?\s?[AaPp]\.?[Mm]\.?(?![A-Za-z])")
    oclock = re.compile(r"at \d{1,2} o'clock")
    if clock_time.search(sentence) or oclock.search(sentence):
        return 1
    return 0

## Existence of date
def hasdate(sentence):
    """Return 1 when ``sentence`` contains something that looks like a date, else 0.

    Recognised forms:
      * numeric dates separated by ``/``, ``-`` or ``.`` (``12/05``,
        ``12-05-2016``, ``12.05.2016``),
      * "<day> of <month>" (``5th of May``),
      * weekday names,
      * "<month> <day>" (``Dec 25th``).

    The separator in the dotted form used to be an unescaped ``.``, so any three
    characters starting and ending with a digit ("100", "1 2") counted as a
    date. The dot is now literal. The short numeric form already matches every
    string the long day/month/year form matches, so a single pattern covers both.
    """
    months = r"(Jan|jan|Feb|feb|Mar|mar|Apr|apr|May|may|June|june|July|july|Aug|aug|Sep|sep|Oct|oct|Nov|nov|Dec|dec)"
    patterns = (
        # dd/mm, dd-mm, dd.mm, optionally followed by a four-digit year
        r"\d{1,2}[/\-.]\d{1,2}(?:[/\-.]\d{4})?",
        r"\d{1,2}(st|th|nd|rd)* of " + months + r"+",
        r"(Monday|monday|Tuesday|tuesday|Wednesday|wednesday|Thursday|thursday|Friday|friday|Saturday|saturday|Sunday|sunday)",
        months + r" \d{1,2}(st|th|nd|rd)*",
    )
    if any(re.search(pattern, sentence) for pattern in patterns):
        return 1
    return 0

def countFullyCapitalizeTokens(tokens):
    """Count the tokens for which ``isupper()`` is true."""
    total = 0
    for token in tokens:
        if token.isupper():
            total += 1
    return total

def countUpper(tokens):
    """Count the tokens whose first character is upper-case.

    Empty tokens are skipped; ``word[0]`` used to raise IndexError on them.
    """
    return sum(1 for word in tokens if word[:1].isupper())

def countexclama(message):
    """Count the exclamation marks in ``message``."""
    return sum(1 for char in message if char == "!")

def countques(tokens):
    """Count the tokens that consist solely of question marks (``?``, ``??``, ...).

    Empty tokens are not counted; the former ``count("?") == len(token)`` test
    was also true for ``""`` (0 == 0).
    """
    return sum(1 for token in tokens if token and token.count("?") == len(token))

def countdots(tokens):
    """Count the tokens that are exactly the three-dot string "..."."""
    return list(tokens).count("...")

def hasslang(tokens,slangDictionary):
    """Return 1 if any token is contained in ``slangDictionary``, else 0."""
    return int(any(token in slangDictionary for token in tokens))

In [82]:
# Read the three tab-separated training files (no header row).
data, data1, data2 = (
    pd.read_csv(path, sep="\t", header=None)
    for path in ('twitter-2013train.txt', 'twitter-2015train.txt', 'twitter-2016train.txt')
)

# Stack them into one frame with a fresh 0..n-1 index and name the columns.
frames = [data, data1, data2]
result = pd.concat(frames, ignore_index=True)
result.columns = ["id", "polarity", "tweet"]

# result = result.drop_duplicates(["tweet"], keep='last')
# Raw text of the first 40 tweets (the same rows filterer processes by default).
rescol = result.head(40)["tweet"]

# Every cleaning step enabled except stemming and lemmatization.
filteredtweets = filterer(
    result,
    pclean=1,
    fixer=1,
    whitespaces=1,
    rm_numbers=1,
    rm_punc=1,
    corrector=1,
    stemmer=0,
    lemmatizer=0,
    lowercase=1,
)

## Morphological Feature Extractor

In [103]:
##### BEFORE FILTER
# Tokenise each raw tweet once and reuse the tokens for every token-level
# feature below (previously each feature re-tokenised all tweets).
rescol_tokens = [tknzr.tokenize(tweet) for tweet in rescol]

## Elongated
# NOTE(review): this is the only feature in this cell computed on the filtered
# tweets rather than on the raw `rescol` text - confirm that is intended.
feat_elong = elongated(filteredtweets)
feat_elong_num = [numberOfElongatedWords(tweet) for tweet in rescol]

#Has time involved
has_time = [hastime(tweet) for tweet in rescol]

#has date
has_date = [hasdate(tweet) for tweet in rescol]

#capitalize num
count_capital = [countFullyCapitalizeTokens(tokens) for tokens in rescol_tokens]

#capitalize upper first
count_uppernum = [countUpper(tokens) for tokens in rescol_tokens]

#count ! marks
count_excla = [countexclama(tweet) for tweet in rescol]

#count ? marks
count_ques = [countques(tokens) for tokens in rescol_tokens]

#count dots
count_dots = [countdots(tokens) for tokens in rescol_tokens]

#has slang
# Each lexicon line contributes the text before its first "="; the file is
# opened with a context manager so the handle is closed again.
with open("lexicons/slangDict.txt", 'r') as slang_file:
    slangdata = [line.split("=")[0] for line in slang_file]
has_slang = [hasslang(tokens, slangdata) for tokens in rescol_tokens]


## Lexicon Feature Extractor

### Cluster

In [173]:
# Twitter Clusters
def checkClusters(tokens,clusters):
    """Build a 0/1 vector with one slot per entry of ``clusters.keys``.

    A slot is set to 1 when at least one token maps, through ``clusters.d``,
    to the cluster stored at that position of ``clusters.keys``. Tokens that
    are missing from ``clusters.d`` are ignored.
    """
    tags = [0] * len(clusters.keys)
    for token in tokens:
        cluster_id = clusters.d.get(token, "no_cluster")
        if cluster_id != "no_cluster":
            tags[clusters.keys.index(cluster_id)] = 1
    return tags

def loadClusters():
    """Create and return a new ``Clusters.Clusters`` instance."""
    return Clusters.Clusters()

clusters = loadClusters()
# One cluster-membership vector per raw tweet.
cluster_tags = [checkClusters(tknzr.tokenize(tweet), clusters) for tweet in rescol]

### Get Pos Tags

In [161]:
# First element of the tagger result for every raw tweet.
pos_tags_list = [arktagger.pos_tag_list(tweet)[0] for tweet in rescol]


### POS grammatical

In [180]:
# Get Bigrams
def get_bi(l):
    """Return, for every sequence in ``l``, the list of its bigrams."""
    return [list(bigrams(sequence)) for sequence in l]

# Get Trigrams
def get_tri(l):
    """Return, for every sequence in ``l``, the list of its trigrams."""
    return [list(trigrams(sequence)) for sequence in l]

def numberOfAdjectives(pos):
    """Count the entries of ``pos`` equal to the tag "A"."""
    return sum(1 for tag in pos if tag == "A")

#calculate the number of adverbs
def numberOfAdverbs(pos):
    """Count the entries of ``pos`` equal to the tag "R"."""
    total = 0
    for tag in pos:
        if tag == "R":
            total += 1
    return total

#calculate the number of interjections
def numberOfIntejections(pos):
    """Count the entries of ``pos`` equal to the tag "!"."""
    return list(pos).count("!")

#calculate the number of verbs
def numberOfVerbs(pos):
    """Count the entries of ``pos`` equal to the tag "V"."""
    verbs = [tag for tag in pos if tag == "V"]
    return len(verbs)

#calculate the number of nouns
def numberOfNouns(pos):
    """Count the entries of ``pos`` equal to the tag "N"."""
    return len(list(filter(lambda tag: tag == "N", pos)))

#calculate the number of proper nouns
def numberOfProperNouns(pos,tokens):
    """Count positions tagged "^" whose token is not a placeholder.

    The tagger wrongly labels the placeholders ``<user>``, ``<sadface>``,
    ``<smile>`` and ``<url>`` as proper nouns, so those are excluded.

    ``pos`` and ``tokens`` are paired position by position. When they differ in
    length the surplus entries are ignored, which is what the former bare
    ``except: pass`` achieved for that case. Unrelated errors are no longer
    swallowed.
    """
    placeholders = ("<user>", "<sadface>", "<smile>", "<url>")
    return sum(
        1
        for tag, token in zip(pos, tokens)
        if tag == "^" and token not in placeholders
    )
            

#calculate the number of urls
def numberOfUrls(pos,tokens):
    """Count the "<url>" placeholder tokens (``pos`` is accepted but not used)."""
    return sum(1 for token in tokens if token == "<url>")

#calculate the number of subjective emoticons
def numberOfSubjectiveEmoticons(pos,tokens):
    """Count "<sadface>" and "<smile>" placeholder tokens (``pos`` is accepted but not used)."""
    subjective = ("<sadface>", "<smile>")
    total = 0
    for token in tokens:
        if token in subjective:
            total += 1
    return total

#calculate the number of positive emoticons
def numberOfPositiveEmoticons(tokens):
    """Count the "<smile>" placeholder tokens."""
    return list(tokens).count("<smile>")

#calculate the number of neutral emoticons
def numberOfNeutralEmoticons(tokens):
    """Count the "<neutralface>" placeholder tokens."""
    return sum(1 for token in tokens if token == "<neutralface>")

#calculate the number of negative emoticons
def numberOfNegativeEmoticons(tokens):
    """Count the "<sadface>" placeholder tokens."""
    total = 0
    for token in tokens:
        if token == "<sadface>":
            total += 1
    return total

# Counts derived from the POS tags of each tweet.
num_adj = [numberOfAdjectives(tags) for tags in pos_tags_list]
num_adv = [numberOfAdverbs(tags) for tags in pos_tags_list]
num_int = [numberOfIntejections(tags) for tags in pos_tags_list]
num_ver = [numberOfVerbs(tags) for tags in pos_tags_list]
num_nou = [numberOfNouns(tags) for tags in pos_tags_list]

# Tokenise each raw tweet once. pos_tags_list was built by iterating rescol in
# the same order, so zip() pairs tags and tokens by position instead of relying
# on the label-based lookup rescol[i].
tweet_tokens = [tknzr.tokenize(tweet) for tweet in rescol]
tagged_tweets = list(zip(pos_tags_list, tweet_tokens))

num_pno = [numberOfProperNouns(tags, tokens) for tags, tokens in tagged_tweets]
num_url = [numberOfUrls(tags, tokens) for tags, tokens in tagged_tweets]
num_sem = [numberOfSubjectiveEmoticons(tags, tokens) for tags, tokens in tagged_tweets]

# The emoticon counters look for placeholder *tokens*; they used to be fed the
# POS tag lists. Only the first len(pos_tags_list) tweets are used so the
# result lists keep the same length as the tag-based counts.
num_pem = [numberOfPositiveEmoticons(tokens) for _, tokens in tagged_tweets]
num_neu = [numberOfNeutralEmoticons(tokens) for _, tokens in tagged_tweets]
num_neg = [numberOfNegativeEmoticons(tokens) for _, tokens in tagged_tweets]


### Lexicon

## Feature Calc

In [195]:

#pos
# For every tweet, turn each count into its share of the tweet's total count
# and rescale that share from [0, 1] to [-1, 1] via 2 * share - 1.
pos_feat = []
pos_count_columns = [
    num_adj, num_adv, num_int, num_ver, num_nou, num_pno,
    num_url, num_sem, num_pem, num_neu, num_neg,
]

for counts in zip(*pos_count_columns):
    total = float(sum(counts))
    if total == 0:
        # No counted element in this tweet: every share is 0, i.e. -1 after
        # rescaling. The unguarded division raised ZeroDivisionError here.
        pos_feat.append([-1.0] * len(counts))
    else:
        pos_feat.append([2 * (count / total) - 1 for count in counts])
 

## Submission File Creator

In [132]:
# NOTE(review): both input paths are absolute and machine-specific; consider
# moving them to a configurable location.
# First comma-separated field of every line of the submission template.
with open("/home/nassar/Downloads/english_submission.csv", 'r') as ids_file:
    eng1 = [line.split(",")[0] for line in ids_file]
# One stripped line per entry of the results file.
with open("/home/nassar/aueb.twitter.sentiment/testres.csv", 'r') as predictions_file:
    eng2 = [line.strip() for line in predictions_file]

# Pair each first field with a sentiment value; "sentiment" is prepended so it
# lines up with the first line of the template. list() keeps `thelist`
# indexable on Python 3 as well, where zip() returns an iterator.
thelist = list(zip(eng1, ["sentiment"] + eng2))

# Context managers make sure every file handle is closed, even on error.
with open('sub.csv', 'w') as thefile:
    for row_id, sentiment in thelist:
        thefile.write("%s\n" % ','.join([row_id, sentiment]))

In [124]:
# zip(*thelist) transposes the list of pairs into two columns; keep the first
# two entries of each column as a tuple.
data_tupled_list = [(x[0],x[1]) for x in zip(*thelist)]


In [125]:
data_tupled_list

[(u'id', u'218775148495515649'), (u'sentiment', 'negative')]