## This notebook is used to generate the finalized version of the classifier, to simply feature transformation into the final form, and to test that the results are the same

Most of the code comes from operational_classifier.

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.externals import joblib
# from joblib import *
#reload(sys)
#sys.setdefaultencoding("utf-8")

from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.stem import WordNetLemmatizer 
  
nltk.download('punkt')    
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


#Loading raw data
df = pd.read_csv("data/labeled_data.csv")
tweets = df.tweet


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stephengriggs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stephengriggs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
len(tweets)

24783

## Feature generation

In [3]:


stopwords=stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)

stemmer = PorterStemmer()

## Original Version

# def preprocess(text_string):
#     """
#     Accepts a text string and replaces:
#     1) urls with URLHERE
#     2) lots of whitespace with one instance
#     3) mentions with MENTIONHERE

#     This allows us to get standardized counts of urls and mentions
#     Without caring about specific people mentioned
#     """
#     space_pattern = '\s+'
#     giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
#         '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
#     mention_regex = '@[\w\-]+'
#     parsed_text = re.sub(space_pattern, ' ', text_string)
#     parsed_text = re.sub(giant_url_regex, '', parsed_text)
#     parsed_text = re.sub(mention_regex, '', parsed_text)
#     #parsed_text = parsed_text.code("utf-8", errors='ignore')
#     return parsed_text


## Omari's version
def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
       '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-\:]+' #<<<<<<added the semicolon after the + to remove : at end of Rt's
    emoji_regex = '&#[0-9\;\:]+'    #<<<<<<<<<remove emoji's .ex; &#1214324
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    parsed_text = re.sub(emoji_regex,'',parsed_text)
    parsed_text = parsed_text.strip(string.punctuation)
    return parsed_text








## Original Version 
# def tokenize(tweet):
#     """Removes punctuation & excess whitespace, sets to lowercase,
#     and stems tweets. Returns a list of stemmed tokens."""
#     tweet = " ".join(re.split("[^a-zA-Z]*", tweet.lower())).strip()
#     #tokens = re.split("[^a-zA-Z]*", tweet.lower())
#     tokens = [stemmer.stem(t) for t in tweet.split()]
#     return tokens



## Omari's Version used for initial models
def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split('\s|(?<!\d)[,.]|[,.](?!\d)', tweet.lower())).strip()
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens


## Kelly's version
# def tokenize(tweet):
#     tokens = []
#     # remove non-alphabetic characters
#     tweet_text = tweet_text = re.sub("[^a-zA-Z]"," ", str(tweet))
#     #remove html content
#     tweet_text = BeautifulSoup(tweet_text).get_text()
#     # tokenize
#     words = word_tokenize(tweet_text.lower())
#     # lemmatize each word to its lemma
#     lemma_words = [lemmatizer.lemmatize(i) for i in words]
#     tokens.append(lemma_words)
#     return(tokens[0])


def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]*", tweet.lower())).strip()
    return tweet.split()

vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords, #We do better when we keep stopwords
    use_idf=True,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.501
    )

In [4]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(tweets).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores
tfidf.shape

  'stop_words.' % sorted(inconsistent))


(24783, 7271)

In [5]:
vocab

{'!': 0,
 '! @@@@': 1,
 '! @@@@ top': 2,
 '!!': 3,
 '!!!': 4,
 '!!!!': 5,
 '"': 6,
 '" "': 7,
 '" -': 8,
 '" bitch': 9,
 '" damn': 10,
 '" fuck': 11,
 '" got': 12,
 '" hoe': 13,
 '" i\'m': 14,
 '" like': 15,
 '" lmao': 16,
 '" love': 17,
 '" nigga': 18,
 '" pussi': 19,
 '" swear': 20,
 '" thi': 21,
 '" thi bitch': 22,
 '"a': 23,
 '"all': 24,
 '"bad': 25,
 '"bad bitch"': 26,
 '"bad bitches"': 27,
 '"be': 28,
 '"bitch': 29,
 '"bitch"': 30,
 '"black': 31,
 '"come': 32,
 '"damn': 33,
 '"do': 34,
 '"don\'t': 35,
 '"fuck': 36,
 '"fuck right': 37,
 '"girl': 38,
 '"go': 39,
 '"go talk': 40,
 '"go talk hoes"': 41,
 '"good': 42,
 '"he': 43,
 '"hey': 44,
 '"hoe': 45,
 '"hoes"': 46,
 '"i': 47,
 '"i ain\'t': 48,
 '"i got': 49,
 '"i hate': 50,
 '"i love': 51,
 '"i play': 52,
 '"i play soccer': 53,
 '"i want': 54,
 '"i\'m': 55,
 '"if': 56,
 '"in': 57,
 '"it': 58,
 '"it\'': 59,
 '"let': 60,
 '"look': 61,
 '"mi': 62,
 '"nah': 63,
 '"new': 64,
 '"nigga': 65,
 '"nigger"': 66,
 '"no': 67,
 '"oh': 68,
 '"o

In [6]:
#Get POS tags for tweets and save as a string
tweet_tags = []
for t in tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    #for i in range(0, len(tokens)):
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)
        #print(tokens[i],tag_list[i])

In [7]:
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
pos_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=5000,
    min_df=5,
    max_df=0.501,
    )

In [8]:
#Construct POS TF matrix and get vocab dict
pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}
pos.shape

(24783, 571)

In [9]:
#Now get other features
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *

sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    +
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
    retweet = 0
    if "rt" in words:
        retweet = 1
    features = [FKRA,
                FRE,
                syllables,
                avg_syl,
                num_chars,
                num_chars_total,
                num_terms,
                num_words,
                num_unique_terms,
                sentiment['neg'],
                sentiment['pos'],
                sentiment['neu'],
                sentiment['compound'],
                twitter_objs[2],
                twitter_objs[1],
                twitter_objs[0],
                retweet]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features(t))
    return np.array(feats)

In [10]:
other_features_names = ["FKRA",
                        "FRE",
                        "num_syllables",
                        "avg_syl_per_word",
                        "num_chars",
                        "num_chars_total",
                        "num_terms",
                        "num_words",
                        "num_unique_words",
                        "vader neg",
                        "vader pos",
                        "vader neu",
                        "vader compound",
                        "num_hashtags",
                        "num_mentions",
                        "num_urls",
                        "is_retweet"]

In [11]:
feats = get_feature_array(tweets)
print(feats)

[[  8.8   73.15  30.   ...   1.     0.     0.  ]
 [  5.9   77.81  19.   ...   1.     0.     0.  ]
 [  6.5   80.46  23.   ...   2.     0.     1.  ]
 ...
 [  3.1   96.03  15.   ...   0.     0.     0.  ]
 [  0.6  103.05   8.   ...   0.     0.     0.  ]
 [  9.8   55.22  27.   ...   0.     1.     0.  ]]


In [12]:
feats.shape

(24783, 17)

In [13]:
#Now join them all up
M = np.concatenate([tfidf,pos,feats],axis=1)

In [14]:
M.shape

(24783, 7859)

In [15]:
#Finally get a list of variable names
variables = ['']*len(vocab)
for k,v in vocab.items():
    variables[v] = k

pos_variables = ['']*len(pos_vocab)
for k,v in pos_vocab.items():
    pos_variables[v] = k

feature_names = variables+pos_variables+other_features_names

# Running the model

This model was found using a GridSearch with 5-fold cross validation. Details are in the notebook operational_classifier.

In [16]:
X = pd.DataFrame(M)
y = df['class'].astype(int)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

In [18]:
select = SelectFromModel(LogisticRegression(class_weight='balanced',penalty="l1",C=0.01))
X_ = select.fit_transform(X,y)



In [19]:
select.get_params

<bound method BaseEstimator.get_params of SelectFromModel(estimator=LogisticRegression(C=0.01, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=None)>

In [20]:
model = LinearSVC(class_weight='balanced',C=0.01, penalty='l2', loss='squared_hinge',multi_class='ovr').fit(X_, y)



In [21]:
y_preds = model.predict(X_)

In [22]:
report = classification_report( y, y_preds )

In [23]:
print(report)

              precision    recall  f1-score   support

           0       0.44      0.56      0.49      1430
           1       0.97      0.89      0.93     19190
           2       0.76      0.95      0.85      4163

   micro avg       0.88      0.88      0.88     24783
   macro avg       0.72      0.80      0.76     24783
weighted avg       0.90      0.88      0.89     24783



# Using information from the model to obtain the matrix X_ generically

This is the most difficult task: We have to take the inputs tweets and transform them into a format that can be used in the model without going through all the same pre-processing steps as above. This can be done as follows.

## Obtaining information about the model

In [24]:
final_features = select.get_support(indices=True) #get indices of features
final_feature_list = [str(feature_names[i]) for i in final_features] #Get list of names corresponding to indices

In [25]:
len(final_feature_list)

205

In [26]:
print(final_feature_list)

['"bitch', '"bitch"', '"nigger"', '#faggot', '#tcot', '#yanke', '-', '2', 'activ', "ain't", 'al', 'america', 'american', 'anoth', 'ape', 'ass', 'ass hoe', 'ass nigga', 'bad', 'beaner', 'big', 'bird', 'bitch', 'bitch nigga', 'bitch!', 'bitch"', 'black', 'border', 'bout', 'browni', 'buy', 'campu', 'charli', 'chill', 'chink', 'chug', 'clam', 'color', 'color folk', 'coon', 'countri', 'cracker', 'crazi', 'crow', 'cunt', 'da', 'damn', 'darki', 'dick', 'die', 'doe', 'dyke', 'dyke bitch', 'fag', 'faggot', 'fat', 'femal', 'feminist', 'filth', 'first', 'folk', 'fucc nicca', 'fuck', 'fuckin', 'fuzzi', 'game', 'gay', 'girl', 'go', 'good', 'gook', 'got', 'got nigga', 'hate', 'hate hoe', 'head', 'hi', 'ho', 'hoe', 'homo', 'hood', 'human', "i'm", 'jew', 'jig', 'jihadi', 'kill', 'lame', 'latina', 'least', 'let', 'like', 'lmfao', 'lol', 'look like', 'love', 'may', 'mention', 'mexican', 'mock', 'money', 'monkey', 'muslim', 'muzzi', 'negro', 'nicca', 'nig', 'nigga', 'nigga tri', 'niggah', 'niggaz', 'nigg

In [28]:
#Getting names for each class of features

# ngram_features = final_feature_list[:final_feature_list.index('zimmerman')+ 1]
# pos_features = final_feature_list[final_feature_list.index('zimmerman')+1:final_feature_list.index('VBZ DT JJ')+1]
# oth_features = final_feature_list[final_feature_list.index('FKRA'):]

ngram_features = final_feature_list[:final_feature_list.index('zimmerman')+ 1]
pos_features = final_feature_list[final_feature_list.index('zimmerman')+1:final_feature_list.index('VBP JJ')+1]
oth_features = final_feature_list[final_feature_list.index('FKRA'):]


In [29]:
print(f'ngram features: {len(ngram_features)}')
print(f'pos features: {len(pos_features)}')
print(f'other features: {len(oth_features)}')
print(len(oth_features)+len(pos_features)+len(ngram_features))

ngram features: 181
pos features: 10
other features: 14
205


## Generating ngram features

In [30]:
new_vocab = {v:i for i, v in enumerate(ngram_features)}
new_vocab_to_index = {}
for k in ngram_features:
    new_vocab_to_index[k] = vocab[k]

In [31]:
#Get indices of text features
ngram_indices = final_features[:len(ngram_features)]

In [32]:
#TODO: Pickle new vectorizer

In [33]:
new_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    min_df=1,
    max_df=1.0,
    vocabulary=new_vocab
    )

In [34]:

# joblib.dump(new_vectorizer, 'py3models/final_tfidf.pkl') 

In [35]:
tfidf_ = new_vectorizer.fit_transform(tweets).toarray()
tfidf_.shape

  'stop_words.' % sorted(inconsistent))


(24783, 181)

In [36]:
#Verifying that results are the same

In [37]:
tfidf_[1,:]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [38]:
tfidf_[1,:].sum()

2.0

In [39]:
X_[1,:tfidf_.shape[1]]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 4.84692478, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [40]:
X_[1,:tfidf_.shape[1]].sum()

7.705379799270522

Results are the same if use IDF but the problem is that IDF will be different if we use different data. Instead we have to use the original IDF scores and multiply them by the new matrix.

In [41]:
idf_vals_ = idf_vals[ngram_indices]

In [42]:
idf_vals_.shape

(181,)

In [43]:
#TODO: Pickle idf_vals

# joblib.dump(idf_vals_, 'py3models/final_idf.pkl')


In [44]:
(tfidf_[1,:]*idf_vals_) == X_[1,:153] #Got same value as final process array!

  """Entry point for launching an IPython kernel.


False

In [45]:
tfidf_*idf_vals_ == X_[:,:153]

  """Entry point for launching an IPython kernel.


False

In [46]:
tfidffinal = tfidf_*idf_vals_
tfidffinal.shape

(24783, 181)

## Generating POS features
This is simpler as we do not need to worry about IDF but it will be slower as we have to compute the POS tags for the new data. Here we can simply use the old POS tags.

In [47]:
new_pos = {v:i for i, v in enumerate(pos_features)}

In [48]:
#TODO: Pickle pos vectorizer
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
new_pos_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    min_df=1,
    max_df=1.0,
    vocabulary=new_pos
    )

In [49]:
# joblib.dump(new_pos_vectorizer, 'py3models/final_pos.pkl') 
# joblib.dump(model, 'py3models/final_mdl.pkl')

In [50]:
pos_ = new_pos_vectorizer.fit_transform(tweet_tags).toarray()
pos_.shape

(24783, 10)

In [51]:
pos_[1,:]

array([0., 0., 1., 0., 0., 0., 0., 0., 2., 0.])

In [52]:
X_[1,153:159]

array([0., 0., 0., 0., 0., 0.])

In [53]:
pos_[:,:] == X_[:,153:159]

  """Entry point for launching an IPython kernel.


False

In [54]:
pos_[:,:].sum()

92309.0

In [55]:
X_[:,153:159].sum()

13990.000798848323

## Finally, we can look at the other features

In [56]:
print(other_features_names)

print(len(other_features_names))

['FKRA', 'FRE', 'num_syllables', 'avg_syl_per_word', 'num_chars', 'num_chars_total', 'num_terms', 'num_words', 'num_unique_words', 'vader neg', 'vader pos', 'vader neu', 'vader compound', 'num_hashtags', 'num_mentions', 'num_urls', 'is_retweet']
17


In [57]:
print(oth_features)
print(len(oth_features))

['FKRA', 'FRE', 'num_syllables', 'avg_syl_per_word', 'num_chars', 'num_chars_total', 'num_terms', 'num_words', 'num_unique_words', 'vader neu', 'vader compound', 'num_hashtags', 'num_mentions', 'num_urls']
14


In [58]:
for x in oth_features:
    print(x)

FKRA
FRE
num_syllables
avg_syl_per_word
num_chars
num_chars_total
num_terms
num_words
num_unique_words
vader neu
vader compound
num_hashtags
num_mentions
num_urls


The functions can be modified to only calculate and return necessary fields.

In [59]:
def other_features_(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
    features = [FKRA,
                FRE,
                syllables,
                avg_syl,
                num_chars,
                num_chars_total,
                num_terms,
                num_words,
                num_unique_terms,
                sentiment['neu'],
                sentiment['compound'],
                twitter_objs[2],
                twitter_objs[1],
                twitter_objs[0]]

    #features = pandas.DataFrame(features)
    
    return features

def get_feature_array_(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features_(t))
    return np.array(feats)

In [60]:
feats_ = get_feature_array_(tweets)
feats_.shape

(24783, 14)

In [61]:
feats_[0,:]

array([  8.8   ,  73.15  ,  30.    ,   1.3043, 120.    , 140.    ,
        25.    ,  23.    ,  21.    ,   0.88  ,   0.4563,   0.    ,
         1.    ,   0.    ])

In [62]:
X_[0,174:]

array([  0.    ,   0.    ,   0.    ,   0.    ,   0.    ,   0.    ,
         0.    ,   0.    ,   0.    ,   3.    ,   0.    ,   0.    ,
         1.    ,   4.    ,   0.    ,   1.    ,   0.    ,   8.8   ,
        73.15  ,  30.    ,   1.3043, 120.    , 140.    ,  25.    ,
        23.    ,  21.    ,   0.88  ,   0.4563,   0.    ,   1.    ,
         0.    ])

In [63]:
feats_[:,:] == X_[:,159:]

  """Entry point for launching an IPython kernel.


False

## Now that we have put it all together using a simplified process we can assess if these new data return the same answers.

In [64]:
M_ = np.concatenate([tfidffinal, pos_, feats_],axis=1)

In [65]:
M_.shape

(24783, 205)

In [66]:
X__ = pd.DataFrame(M_)

In [67]:
y_preds_ = model.predict(X__)

In [68]:
report = classification_report( y, y_preds_ )

In [69]:
print(report)

              precision    recall  f1-score   support

           0       0.44      0.56      0.49      1430
           1       0.97      0.89      0.93     19190
           2       0.76      0.95      0.85      4163

   micro avg       0.88      0.88      0.88     24783
   macro avg       0.72      0.80      0.76     24783
weighted avg       0.90      0.88      0.89     24783



OK. So now that we have verified that the results are the same with X_ and X__ we can implement a script that can transform new data in this manner.

In [70]:
X__.shape

(24783, 205)

In [71]:
joblib.dump(idf_vals_, 'py3models/final_idf2.pkl')
joblib.dump(new_pos_vectorizer, 'py3models/final_pos2.pkl') 
joblib.dump(model, 'py3models/final_mdl2.pkl')
joblib.dump(new_vectorizer, 'py3models/final_tfidf2.pkl')

['py3models/final_tfidf2.pkl']