In [1]:
import string
from collections import defaultdict

import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC
from spacy.en import English


In [2]:
parser = English()
punctuations = string.punctuation

In [3]:
def prepare_labels():
    emoji = pd.read_csv('emoji.txt', sep="\n", header=None)
    emoji.columns = ["icons"]
    emoji.index += 1
    return emoji

In [4]:
emoji = prepare_labels()
emoji.icons.unique()

['heart_eyes' 'yum' 'sob' 'blush' 'weary' 'smirk' 'grin' 'flushed'
 'relaxed' 'wink']


In [5]:
def transform_labels(emoji):
    for i, name in ((0, 'heart_eyes'), (1, 'yum'), (2, 'sob'), (3, 'blush'), (4, 'weary'),
                    (5, 'smirk'), (6, 'grin'), (7, 'flushed'), (8, 'relaxed'), (9, 'wink')):
            emoji['icons'].replace(name, i, inplace=True)
    return emoji

In [None]:
emoji = transform_labels(emoji)
emoji.icons.unique()

In [6]:
def prepare_features():
    with open('tweets.txt') as f:
        tweets = f.readlines()
    return tweets

In [7]:
stoplist = set('for a of the and to in'.split())


def my_tokenizer(sentence):
    tokens = [word for word in sentence.lower().split() if word not in stoplist]
    # remove words that appear only once
    frequency = defaultdict(int)
    for token in tokens:
        frequency[token] += 1
    return [token for token in tokens if frequency[token] > 1]

In [8]:
# Create spacy tokenizer that parses a sentence and generates tokens
# these can also be replaced by word vectors
def spacy_tokenizer(sentence):
    tokens = parser(sentence)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in stopwords and tok not in punctuations)]
    return tokens

In [9]:
# Basic utility function to clean the text
def clean_text(text):
    return text.strip().lower()


# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

In [10]:
emoji = prepare_labels()

tweets = prepare_features()

inputs_train, inputs_test, expected_output_train, expected_output_test = train_test_split(tweets, emoji)

In [11]:
vectorizer = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 3))

classifier = LinearSVC()
classifier2 = SVC()
classifier3 = RandomForestClassifier()

In [12]:
# defining model
pipe = Pipeline([('cleaner', predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

In [13]:
# Create model
pipe.fit(inputs_train, expected_output_train)

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('cleaner', <__main__.predictors object at 0x117708940>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [None]:
# now we can save it to a file
joblib.dump(pipe, 'model.pkl') 

In [None]:
# predict data
pred_data = pipe.predict(inputs_test)

In [None]:
# and measure accuracy
for (sample, pred) in zip(inputs_test, pred_data):
    print(sample, ">>>>>", pred)

print("Accuracy:", accuracy_score(expected_output_test, pred_data))