In [1]:
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
import re
import preprocessor as p
import enchant
from nltk.stem import PorterStemmer, WordNetLemmatizer
from __future__ import unicode_literals
from stemming.porter2 import stem
import ftfy
import itertools
from re import search
import csv
import pandas as pd
from tokenizers import twokenize
from postaggers import arktagger
from nltk import bigrams
from nltk import trigrams
from nltk.tokenize import TweetTokenizer
from clusters import Clusters
import numpy as np
import pickle
import os.path
from pathlib import Path
np.set_printoptions(threshold=np.nan)
from sklearn import svm

# Required:
#pip install nltk
#pip install stemming

## Filter/Cleaning

In [2]:
# Emoticon sub-pattern, written for re.VERBOSE (whitespace and "# ..." comments
# inside the pattern are ignored by the regex engine).
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

# Alternatives are tried in list order, so the more specific token kinds
# (emoticons, tags, mentions, hashtags, URLs) must stay ahead of the generic ones.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    # The character class used to read "&amp;" (an HTML-escaping artifact of "&").
    # The extra letters and ";" were already covered by the other alternatives and
    # by the "$-_" range, so restoring the plain "&" does not change what matches.
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs

    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# tokens_re finds every token in a string; emoticon_re tests whether one token is
# exactly an emoticon. Both are case-insensitive.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

def lower_tokenize(s, lowercase=False):
    """Split ``s`` into tokens with ``tokens_re``, optionally lower-casing them.

    Args:
        s: Text to tokenise.
        lowercase: When true, every token is lower-cased except those that
            ``emoticon_re`` recognises as an emoticon (so ":D" keeps its case).

    Returns:
        List of token strings in order of appearance.
    """
    # No `tokenize` helper is defined or imported in this notebook, so the previous
    # `tokenize(s)` call raised NameError; use the compiled token pattern directly.
    tokens = tokens_re.findall(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

def wordcorrector(tweet):
    """Replace misspelled tokens with the first en_US spelling suggestion.

    Args:
        tweet: List of token strings.

    Returns:
        A new list of the same length. Tokens the dictionary accepts, and tokens
        for which it offers no suggestion, are kept unchanged.
    """
    d = enchant.Dict("en_US")
    corrected = []
    for word in tweet:
        # The previous version rebound the loop variable, so corrections were
        # computed and then thrown away; collect them in a new list instead.
        # Empty tokens are passed through untouched (presumably the spell-checker
        # rejects empty strings -- TODO confirm against pyenchant).
        if word and not d.check(word):
            suggestions = d.suggest(word)
            corrected.append(suggestions[0] if suggestions else word)
        else:
            corrected.append(word)
    return corrected

def lowered(tweet):
    """Lower-case every token in place and encode it as UTF-8 bytes.

    Args:
        tweet: Mutable list of text tokens; it is modified in place.

    Returns:
        The same list object, now holding UTF-8 encoded, lower-cased tokens.
    """
    # Lower-case BEFORE encoding: lower() on an encoded byte string only touches
    # ASCII letters, so characters such as "É" used to stay upper-case.
    tweet[:] = [token.lower().encode("utf-8") for token in tweet]
    return tweet

# stemming / lemmatization
# NLTK stemmer and lemmatiser instances.
# NOTE(review): filterer() stems with stemming.porter2.stem and has a parameter
# that is also called `stemmer`, so this PorterStemmer instance is not what
# filterer uses.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()


### Feature Extractor \M/
## Bag of Words
# Word-level count vectoriser capped at 5000 features; every other option is
# passed explicitly as None.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=5000,
)


# Token categories handed to the tweet-preprocessor package; they apply to the
# p.clean() call made in filterer().
p.set_options(
    p.OPT.URL,
    p.OPT.MENTION,
    p.OPT.HASHTAG,
    p.OPT.RESERVED,
    p.OPT.EMOJI,
    p.OPT.SMILEY,
    p.OPT.NUMBER,
)

def filterer(input_data, pclean = 1, fixer = 1, whitespaces = 1, rm_numbers = 1, rm_punc = 1, corrector = 1, stemmer = 1, lemmatizer = 1, lowercase = 1):
    """Clean and normalise the ``"tweet"`` column of ``input_data``.

    Each step runs only when its flag equals 1; tokenisation always runs.

    Args:
        input_data: Object with ``iterrows()`` whose rows expose a ``"tweet"`` field
            (a pandas DataFrame in this notebook).
        pclean: Run ``p.clean`` (tweet-preprocessor) on the text.
        fixer: Run ``ftfy.fix_text`` on the text.
        whitespaces: Collapse every whitespace run to a single space.
        rm_numbers: Drop digit characters.
        rm_punc: Drop the punctuation characters listed below.
        corrector: Run ``wordcorrector`` on the tokens.
        stemmer: Stem tokens with ``stemming.porter2.stem``. This parameter shadows
            the module-level ``stemmer`` object inside the function.
        lemmatizer: Lemmatise tokens as verbs with ``lemmatiser``.
        lowercase: Lower-case the tokens.

    Returns:
        List with one UTF-8 encoded, space-joined string per input row.
    """
    punctuation = frozenset(("'", '\\', '*', ';', '$', '%', '&', '-', '!', '.', ':', '/', '(', ')', '?', ',', '"'))
    filtered_tweets = []
    for index, row in input_data.iterrows():
        # Always pull the text out of the row first. Previously this only happened
        # inside the pclean branch, so pclean=0 passed the whole row object to the
        # string-processing steps below.
        tweet = row["tweet"]
        if pclean == 1:
            tweet = p.clean(tweet)
        if fixer == 1:
            ## fix words
            tweet = ftfy.fix_text(tweet)
        if whitespaces == 1:
            ## remove whitespaces
            tweet = re.sub(r'\s+', ' ', tweet)
        if rm_numbers == 1:
            ## remove numbers
            tweet = ''.join(ch for ch in tweet if not ch.isdigit())
        if rm_punc == 1:
            ## remove punctuation
            tweet = ''.join(ch for ch in tweet if ch not in punctuation)
        ## tokenize
        tokens = word_tokenize(tweet)
        ## word corrector
        if corrector == 1:
            tokens = wordcorrector(tokens)
        ## stemmer
        if stemmer == 1:
            tokens = [stem(token) for token in tokens]
        ## lemmatizer
        if lemmatizer == 1:
            tokens = [lemmatiser.lemmatize(token, pos="v") for token in tokens]
        ## lower case
        if lowercase == 1:
            # Lower-case while the tokens are still text. Encoding each token first
            # and then joining with a text separator fails on non-ASCII tokens.
            tokens = [token.lower() for token in tokens]
        filtered_tweets.append(" ".join(tokens).encode('utf-8'))
    return filtered_tweets



## Morphological Features

In [3]:
# Shared NLTK TweetTokenizer instance, reused by the feature-extraction cells below.
tknzr = TweetTokenizer()

## Elongated Word
def has_long(sentence):
    """Return True when ``sentence`` contains an ASCII letter repeated 3+ times in a row."""
    # The back-reference \1 must repeat the captured letter at least twice more.
    return re.search("([a-zA-Z])\\1{2,}", sentence) is not None

def elongated(intweet):
    """Return 1 when ``has_long`` reports an elongated word in ``intweet``, else 0."""
    return 1 if has_long(intweet) == True else 0

def numberOfElongatedWords(sentence):
    """Count whitespace-separated words holding an ASCII letter repeated 3+ times in a row."""
    stretched = re.compile("([a-zA-Z])\\1{2,}")
    total = 0
    for word in sentence.split():
        if stretched.search(word):
            total += 1
    return total

## Existence of time
def hastime(sentence):
    """Return 1 when ``sentence`` mentions a clock time, else 0.

    Recognised forms (case-insensitive):
      * an hour, optionally followed by ":mm" or ".mm", then an optional single
        whitespace and am / pm / a.m. / p.m.  ("5pm", "10:30 PM", "9.15 a.m.")
      * "at <hour> o'clock"
    """
    # The earlier patterns used a bare "." both as the separator before am/pm and
    # inside "a.m."/"p.m.". A bare "." matches any character, so "5pm" was missed
    # (the "p" was consumed as the separator) while "5 army" counted as a time.
    # The dots are now literal, the separator is optional whitespace, and plain
    # am/pm must end at a word boundary so "12 amazing" is not a time.
    time_re = re.compile(
        r"\d{1,2}(?:[:.]\d{1,2})?\s?(?:[ap]m\b|[ap]\.m\.)"
        r"|at \d{1,2} o'clock",
        re.IGNORECASE)
    return 1 if time_re.search(sentence) else 0

## Existence of date
def hasdate(sentence):
    """Return 1 when ``sentence`` mentions a date, else 0.

    Recognised forms:
      * numeric day/month with "/", "-" or "." as separator ("12/05", "3-4",
        "12.05.2017"); the longer dd/mm/yyyy forms start with the same prefix
      * "<day>[st|nd|rd|th] of <month>"
      * a weekday name
      * "<month> <day>[st|nd|rd|th]"
    Month and weekday names match capitalised or all-lower-case only.
    """
    # The dotted patterns previously used a bare "." (any character), so any three
    # characters such as "123" or "1x2" counted as a date. The separator is now a
    # literal "/", "-" or ".".
    # NOTE(review): decimals ("3.5") and scores ("3-1") still match the numeric
    # form, as they did before; tighten further only if that proves to be a problem.
    months = ("Jan|jan|Feb|feb|Mar|mar|Apr|apr|May|may|June|june|July|july|"
              "Aug|aug|Sep|sep|Oct|oct|Nov|nov|Dec|dec")
    weekdays = ("Monday|monday|Tuesday|tuesday|Wednesday|wednesday|Thursday|thursday|"
                "Friday|friday|Saturday|saturday|Sunday|sunday")
    patterns = (
        r"\d{1,2}[/.\-]\d{1,2}",
        r"\d{1,2}(st|th|nd|rd)* of (" + months + r")+",
        r"(" + weekdays + r")",
        r"(" + months + r") \d{1,2}(st|th|nd|rd)*",
    )
    for pattern in patterns:
        if re.search(pattern, sentence):
            return 1
    return 0

def countFullyCapitalizeTokens(tokens):
    """Count the tokens for which ``str.isupper()`` is true."""
    return sum(1 for word in tokens if word.isupper())

def countUpper(tokens):
    """Count the tokens whose first character is an upper-case letter.

    Empty tokens are skipped; indexing them with ``word[0]`` used to raise IndexError.
    """
    # Slicing yields "" for an empty token, and "".isupper() is False.
    return sum(1 for word in tokens if word[:1].isupper())

def countexclama(message):
    """Return how many times "!" occurs in ``message``."""
    exclamations = message.count("!")
    return exclamations

def countques(tokens):
    """Count the tokens made up entirely of "?" characters ("?", "??", ...).

    Note this counts tokens, not individual question marks. Empty tokens are
    ignored; previously an empty token passed the ``count == len`` check
    (0 == 0) and was counted.
    """
    return sum(1 for token in tokens if token and token.count("?") == len(token))

def countdots(tokens):
    """Count the tokens that are exactly the three-dot ellipsis "..."."""
    return sum(1 for word in tokens if word == "...")

def hasslang(tokens,slangDictionary):
    """Return 1 if any token is a member of ``slangDictionary``, else 0."""
    found = any(token in slangDictionary for token in tokens)
    return 1 if found else 0

In [6]:
# Load the headerless test CSV and name its two columns.
result = pd.read_csv('new_english_test.csv', sep=",", header = None)
result.columns = ["id","tweet"]
# Same flags as before, spelled out by name: stemming and lemmatisation are off,
# every other step is on.
filteredtweets = filterer(
    result,
    pclean=1,
    fixer=1,
    whitespaces=1,
    rm_numbers=1,
    rm_punc=1,
    corrector=1,
    stemmer=0,
    lemmatizer=0,
    lowercase=1,
)
# Raw (unfiltered) tweet text, used by the "before filter" feature cells.
rescol = result["tweet"]


## Morphological Feature Extractor

In [None]:
##### BEFORE FILTER
# Per-tweet morphological features, computed on the raw tweet text in `rescol`.
feat_elong = []
feat_elong_num = []
has_time = []
has_date = []
count_capital = []
count_uppernum =[]
count_excla = []
count_ques =[]
count_dots =[]
has_slang = []

# Slang lexicon: keep the text before the first "=" on every line. The file is
# opened in a with-block so the handle is closed (it used to be left open).
with open("lexicons/slangDict.txt", 'r') as slang_file:
    slangdata = [line.split("=")[0] for line in slang_file]
# Hashed copy for fast membership tests; `slangdata` itself stays a list.
slang_lookup = frozenset(slangdata)

for tweet in rescol:
    # Tokenise once per tweet instead of once per token-based feature.
    tweet_tokens = tknzr.tokenize(tweet)
    #elongated words
    feat_elong.append(elongated(tweet))
    feat_elong_num.append(numberOfElongatedWords(tweet))
    #Has time involved
    has_time.append(hastime(tweet))
    #has date
    has_date.append(hasdate(tweet))
    #fully capitalised tokens
    count_capital.append(countFullyCapitalizeTokens(tweet_tokens))
    #tokens starting with an upper-case letter
    count_uppernum.append(countUpper(tweet_tokens))
    #count ! marks
    count_excla.append(countexclama(tweet))
    #count tokens consisting only of ? marks
    count_ques.append(countques(tweet_tokens))
    #count "..." tokens
    count_dots.append(countdots(tweet_tokens))
    #has slang
    has_slang.append(hasslang(tweet_tokens,slang_lookup))

## Lexicon Feature Extractor

### Cluster

In [None]:
# Twitter Clusters
def checkClusters(tokens,clusters):
    """Build a 0/1 indicator list over ``clusters.keys`` for the given tokens.

    Each token is looked up in ``clusters.d``. For every token that maps to a
    cluster id, the slot at that id's position in ``clusters.keys`` is set to 1.
    Tokens with no entry are ignored.
    """
    tags = [0] * len(clusters.keys)
    for token in tokens:
        cluster_id = clusters.d.get(token, "no_cluster")
        if cluster_id != "no_cluster":
            tags[clusters.keys.index(cluster_id)] = 1
    return tags

def loadClusters():
    """Create and return a new ``Clusters.Clusters`` instance."""
    cluster_model = Clusters.Clusters()
    return cluster_model

# Load the cluster lookup once, then build one indicator vector per raw tweet.
clusters = loadClusters()
cluster_tags = []
for tweet in rescol:
    # Tokenise the raw (unfiltered) tweet text with the shared TweetTokenizer.
    tokenized = tknzr.tokenize(tweet)
    cluster_tags.append(checkClusters(tokenized,clusters))

### Get Pos Tags

In [None]:
# One POS-tagger call per filtered tweet. [0] keeps the first element of what
# arktagger.pos_tag_list returns -- presumably the tag sequence for the single
# message passed in; confirm against the postaggers module.
pos_tags_list = [arktagger.pos_tag_list(tweet)[0] for tweet in filteredtweets]


### POS grammatical

In [None]:
# Get Bigrams
def get_bi(l):
    """Return, for each sequence in ``l``, the list of its bigrams."""
    return [list(bigrams(sequence)) for sequence in l]

# Get Trigrams
def get_tri(l):
    """Return, for each sequence in ``l``, the list of its trigrams."""
    return [list(trigrams(sequence)) for sequence in l]

def numberOfAdjectives(pos):
    """Count the "A" (adjective) tags in the POS tag sequence ``pos``."""
    return sum(1 for tag in pos if tag == "A")

#calculate the number of adverbs
def numberOfAdverbs(pos):
    """Count the "R" (adverb) tags in the POS tag sequence ``pos``."""
    return sum(1 for tag in pos if tag == "R")

#calculate the number of interjections
def numberOfIntejections(pos):
    """Count the "!" (interjection) tags in the POS tag sequence ``pos``."""
    return sum(1 for tag in pos if tag == "!")

#calculate the number of verbs
def numberOfVerbs(pos):
    """Count the "V" (verb) tags in the POS tag sequence ``pos``."""
    return sum(1 for tag in pos if tag == "V")

#calculate the number of nouns
def numberOfNouns(pos):
    """Count the "N" (noun) tags in the POS tag sequence ``pos``."""
    return sum(1 for tag in pos if tag == "N")

#calculate the number of proper nouns
def numberOfProperNouns(pos,tokens):
    """Count "^" (proper noun) tags whose token is not a placeholder.

    Args:
        pos: POS tag sequence.
        tokens: Token sequence, paired with ``pos`` position by position.

    Returns:
        Number of positions tagged "^" whose token is none of "<user>",
        "<sadface>", "<smile>" or "<url>".
    """
    # pos tagger wrongly tags these placeholder tokens as proper nouns
    placeholders = ("<user>", "<sadface>", "<smile>", "<url>")
    # zip() stops at the shorter sequence, which gives the same count the bare
    # "except: pass" used to produce when `tokens` was shorter than `pos`, but
    # without swallowing unrelated errors.
    return sum(1 for tag, token in zip(pos, tokens)
               if tag == "^" and token not in placeholders)
            

#calculate the number of urls
def numberOfUrls(pos,tokens):
    """Count the "<url>" placeholder tokens in ``tokens`` (``pos`` is not used)."""
    return sum(1 for token in tokens if token == "<url>")

#calculate the number of subjective emoticons
def numberOfSubjectiveEmoticons(pos,tokens):
    """Count the "<sadface>" and "<smile>" placeholder tokens (``pos`` is not used)."""
    subjective = ("<sadface>", "<smile>")
    return sum(1 for token in tokens if token in subjective)

#calculate the number of positive emoticons
def numberOfPositiveEmoticons(tokens):
    """Count the "<smile>" placeholder tokens in ``tokens``."""
    return sum(1 for token in tokens if token == "<smile>")

#calculate the number of neutral emoticons
def numberOfNeutralEmoticons(tokens):
    """Count the "<neutralface>" placeholder tokens in ``tokens``."""
    return sum(1 for token in tokens if token == "<neutralface>")

#calculate the number of negative emoticons
def numberOfNegativeEmoticons(tokens):
    """Count the "<sadface>" placeholder tokens in ``tokens``."""
    return sum(1 for token in tokens if token == "<sadface>")

# Raw per-tweet counts; the "Feature Calc" cell rescales them.
num_adj = []
num_adv = []
num_int = []
num_ver = []
num_nou = []
num_pno = []
num_url = []
num_sem = []
num_pem = []
num_neu = []
num_neg = []

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(len(pos_tags_list))
lengthx = len(rescol)


# for i in rescol.index:
for i in range(len(filteredtweets)):
    tweet_tags = pos_tags_list[i]
    # Tokenise once per tweet; the token-based counters below all share it.
    tweet_tokens = tknzr.tokenize(filteredtweets[i])
    num_adj.append(numberOfAdjectives(tweet_tags))
    num_adv.append(numberOfAdverbs(tweet_tags))
    num_int.append(numberOfIntejections(tweet_tags))
    num_ver.append(numberOfVerbs(tweet_tags))
    num_nou.append(numberOfNouns(tweet_tags))
    # The emoticon counters compare against token placeholders such as "<smile>".
    # They used to receive the POS tag list, which never contains those strings,
    # so these three features were always 0. Pass the tokens instead.
    # NOTE(review): filterer() strips emoticons via p.clean, so confirm that these
    # placeholders can actually occur in `filteredtweets`.
    num_pem.append(numberOfPositiveEmoticons(tweet_tokens))
    num_neu.append(numberOfNeutralEmoticons(tweet_tokens))
    num_neg.append(numberOfNegativeEmoticons(tweet_tokens))
    num_pno.append(numberOfProperNouns(tweet_tags,tweet_tokens))
    num_url.append(numberOfUrls(tweet_tags,tweet_tokens))
    num_sem.append(numberOfSubjectiveEmoticons(tweet_tags,tweet_tokens))

In [None]:
# Number of filtered tweets; single-argument print(...) works on Python 2 and 3.
print(len(filteredtweets))

### Lexicon

In [None]:
## Negation Lexicon
def negationlex():
    """Load the negation lexicon from ``lexicons/negations.txt``.

    Returns:
        List of text strings, one per line of the file, decoded as UTF-8 with the
        line terminator removed.
    """
    path = "lexicons/negations.txt"
    # Read bytes and decode explicitly so this works on Python 2 and 3; the
    # with-block closes the file even if decoding fails.
    with open(path, "rb") as negfile:
        # Strip only the line terminator. The old slice dropped the last character
        # unconditionally, truncating the final word when the file does not end
        # with a newline.
        return [raw.decode('utf8').rstrip("\r\n") for raw in negfile]

def isneg(tokens,neg_list):
    """Return 1 if any token is contained in ``neg_list``, else 0."""
    return 1 if any(token in neg_list for token in tokens) else 0

## NRC
# Relative paths to NRC lexicon files, in the form accepted by nrc_lexicons(path).
# Only nrc_sl is loaded in the cells visible here.
nrc_sl = "lexicons/NRC/NRC-Hashtag-Sentiment-Lexicon-v0.1/unigrams-pmilexicon.txt"
nrc_md = "lexicons/NRC/MaxDiff-Twitter-Lexicon/Maxdiff-Twitter-Lexicon_-1to1.txt"
nrc_140 = "lexicons/NRC/Sentiment140AffLexNegLex/S140-AFFLEX-NEGLEX-unigrams.txt"

def nrc_lexicons(path):
    """Load a tab-separated lexicon file into a ``{term: score}`` dictionary.

    Args:
        path: File whose lines look like ``term<TAB>score[<TAB>...]`` (UTF-8).

    Returns:
        Dict mapping the first field of each line to the float value of the second.
        Lines with fewer than two fields, or a non-numeric second field, are
        reported on stdout (path, then the line) and skipped.
    """
    d_unigrams = {}
    # Binary read plus explicit decode works on Python 2 and 3; the with-block
    # guarantees the file is closed.
    with open(path, 'rb') as f:
        for raw in f:
            line = raw.decode('utf8')
            fields = line.split("\t")
            try:
                d_unigrams[fields[0]] = float(fields[1])
            except (IndexError, ValueError):
                # Malformed line: report it and carry on, instead of aborting
                # the whole load.
                print(path)
                print("liner %s" % line)
                continue
    return d_unigrams

## Feature Calc

In [None]:

#pos
f_num_adj = []
f_num_adv = []
f_num_int = []
f_num_ver = []
f_num_nou = []
f_num_pno = []
f_num_url = []
f_num_sem = []
f_num_pem = []
f_num_neu = []
f_num_neg = []

## calculate pos
# Each raw count list is paired with the list that receives its rescaled value,
# replacing eleven copy-pasted statements per branch.
pos_feature_pairs = [
    (num_adj, f_num_adj),
    (num_adv, f_num_adv),
    (num_int, f_num_int),
    (num_ver, f_num_ver),
    (num_nou, f_num_nou),
    (num_pno, f_num_pno),
    (num_url, f_num_url),
    (num_sem, f_num_sem),
    (num_pem, f_num_pem),
    (num_neu, f_num_neu),
    (num_neg, f_num_neg),
]

for i in range(len(num_adj)):
    sumof = sum(counts[i] for counts, scaled in pos_feature_pairs)
    if sumof == 0:
        # No counted tags/tokens for this tweet: report its index and use 0 for
        # every feature (this was the ZeroDivisionError branch).
        print(i)
        for counts, scaled in pos_feature_pairs:
            scaled.append(0)
    else:
        # Share of the tweet's total, mapped linearly from [0, 1] to [-1, 1].
        for counts, scaled in pos_feature_pairs:
            scaled.append(2*(counts[i]/float(sumof))-1)


## calculate negation
neglist = negationlex()

# 1 when the filtered tweet contains at least one negation-lexicon token, else 0.
neglexfeat = [isneg(tknzr.tokenize(tweet), neglist) for tweet in filteredtweets]

## calculate NRC Lexicons

dict_nrc_sl = nrc_lexicons(nrc_sl)


## Zip features together

In [None]:
# One row per tweet, one column per feature list. zip() is wrapped in list()
# because np.asarray needs a real sequence; on Python 3 zip() returns an iterator
# and would otherwise produce a 0-d object array.
# NOTE(review): feat_elong and neglexfeat are computed in earlier cells but are
# not part of this matrix -- confirm that is intended.
features = np.asarray(list(zip(feat_elong_num, has_time, has_date, count_capital, count_uppernum,
                               count_excla, count_ques, count_dots, has_slang,f_num_adj,f_num_adv,
                               f_num_int,f_num_ver,f_num_nou,f_num_pno,f_num_url,f_num_sem,f_num_pem,
                               f_num_neu,f_num_neg)))

## Just adding cluster tags to the list of features
cluster_tags = np.asarray(cluster_tags)
features = np.concatenate((features, cluster_tags),axis=-1)
# features = np.column_stack((result["id"].values,result["polarity"].values,result["tweet"].values,features))


In [None]:
# Load the pickled `model` object from model_nolexicon.pkl.
# NOTE(review): pickle.load can execute arbitrary code contained in the file --
# only load model files that come from a trusted source.
with open('model_nolexicon.pkl', 'rb') as handle:
    model = pickle.load(handle)


In [None]:
# Resolve `model` BEFORE opening the file: mode 'wb' truncates the file on open,
# so a NameError raised afterwards (no model in the kernel) would leave an empty,
# unusable pickle behind.
model_to_save = model
with open('model_nolexicon.pkl', 'wb') as output:
    pickle.dump(model_to_save, output, pickle.HIGHEST_PROTOCOL)
# Single-argument print(...) works on Python 2 and 3.
print("Model saved...")


#### Submission File Creator

In [None]:
# NOTE(review): absolute, user-specific paths; move them to a config cell or make
# them relative to the project before sharing this notebook.
submission_template_path = "/home/nassar/Downloads/english_submission.csv"
predictions_path = "/home/nassar/aueb.twitter.sentiment/testres.csv"

# First comma-separated field of every line of the submission template.
with open(submission_template_path, 'r') as template_file:
    eng1 = [line.split(",")[0] for line in template_file]
# One stripped line per entry of the predictions file.
with open(predictions_path, 'r') as predictions_file:
    eng2 = [line.strip() for line in predictions_file]

# Pair each template row with a "sentiment" heading followed by the predictions.
# list(...) keeps `thelist` indexable and reusable on Python 3, where zip() is a
# one-shot iterator. zip() silently truncates to the shorter input.
thelist = list(zip(eng1,["sentiment"]+eng2))
with open('sub.csv', 'w') as thefile:
    for first_field, label in thelist:
        thefile.write("%s\n" % ','.join([first_field, label]))

In [None]:
# zip(*thelist) transposes the (first_field, label) pairs into two long tuples;
# the comprehension then keeps only the first two entries of each.
# NOTE(review): the result is therefore just two 2-tuples, not one tuple per row
# -- confirm this is intended. Also relies on `thelist` being re-iterable
# (a list), which a bare zip() is not on Python 3.
data_tupled_list = [(x[0],x[1]) for x in zip(*thelist)]


In [None]:
# Display the value built in the previous cell.
data_tupled_list