In [1]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))
#printmd('**bold**')

In [3]:
# skip this field if "combined_set.csv" exists
parsed_csv = 'combined_set.csv'

if not os.path.isfile(parsed_csv):
    feeds = pd.read_json('./feeds_2000.ndjson', lines=True)
    labels = pd.read_json('./labels_2000.ndjson', lines=True)

    # not all IDs are found in the tiny labels file, make sure we have a complete (albeit fake, dataset)
    import random
    valid_ids = list(feeds.id)  # the ids found in the feeds_2000
    random.shuffle(valid_ids)  # shuffle these and assign random IDs that exist 
    labels.id = valid_ids
    combined = pd.merge(feeds, labels, on='id')
    expanded = combined.set_index(
        ['id', 'birthyear', 'fame', 'gender', 'occupation']
    )['text'].apply(pd.Series).stack()
    expanded = expanded.reset_index()
    expanded = expanded.drop(columns=['level_5'])  # level_5 is the auto-generated new column, containing an index

    expanded.to_csv(parsed_csv)
else:
    print('file exists, continue!')

file exists, continue!


In [4]:
data_path = os.path.join(os.getcwd(), parsed_csv)

In [5]:
#data_raw = pd.read_csv(data_path)
data_raw = pd.read_csv(open(data_path,'r'), encoding='utf-8', engine='c')
data_raw.columns = ['index', 'id', 'birthyear', 'fame', 'gender', 'occupation', 'text']
data_raw = data_raw.drop(columns=['index', 'id'])  # no need for any non-label data

print(data_raw.shape)  # before dropping NaN values

data_raw = data_raw.dropna()
data_raw['birthyear'] = data_raw['birthyear'].astype(int)  # from 1978.0 -> 1978
print(data_raw.shape)


(5220005, 5)
(5219253, 5)


In [6]:
print("Number of rows in data =",data_raw.shape[0])
print("Number of columns in data =",data_raw.shape[1])
print("\n")
printmd("**Sample data:**")
data_raw.head(30)

Number of rows in data = 5219253
Number of columns in data = 5




**Sample data:**

Unnamed: 0,birthyear,fame,gender,occupation,text
0,1991,star,male,performer,Back at it with @americanidol looking for...he...
1,1991,star,male,performer,"Can’t buy all the happiness in the world, it’s..."
2,1991,star,male,performer,30 down @nytimes 🤝🧡
3,1991,star,male,performer,"📸 @ronyalwin 💘 @ New York, New York https://t...."
4,1991,star,male,performer,🎀 pink it was love at first sight 🎀 @ New York...
5,1991,star,male,performer,Putting my best foot forward in The Memphis by...
6,1991,star,male,performer,Girls UNITED can never be divided! 👯‍♀️❤️ http...
7,1991,star,male,performer,BRB buying The Stephanie bow shoe as an early ...
8,1991,star,male,performer,"Connect the dots, bbs #TheClara - TeamKP @kpc..."
9,1991,star,male,performer,Geometry was The Daina’s favorite subject in s...


In [7]:
categories = list(data_raw.columns.values)
categories = categories[:-1]
print(categories)

['birthyear', 'fame', 'gender', 'occupation']


## 2. Data Pre-Processing

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import WhitespaceTokenizer

tokenizer = WhitespaceTokenizer()
stop_words = set(stopwords.words('english'))
stop_words.update(['rt'])  # remove the retweet tag!

stemmer = SnowballStemmer("english")

import re

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [42]:
data = data_raw
# uncomment below to keep a subset of rows for testing
numrows = 500000
data = data_raw.loc[np.random.choice(data_raw.index, size=numrows)]
data.shape

(500000, 5)

### 2.1. Cleaning Data

In [43]:
def remove_links_and_html(sentence):
    sentence = re.sub(r'http\S+', '', sentence)
    sentence = re.sub(r'<[^<]+?>', '', sentence)

    return sentence

def remove_punct(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    cleaned = cleaned.strip()
    cleaned = cleaned.replace("\n"," ")
    return cleaned

def remove_mentions(sentence):
    return re.sub(r'@#?\b\w\w+\b', '', sentence)

def valid_token(tok):
    if '#' in tok:
        # make sure the hashtag is alphanumeric (avoiding arabic etc)
        return re.sub('[^0-9a-zA-Z]+', '', tok) != ''
    non_stop = tok not in stop_words
    no_rt = 'rt' not in tok
    is_latin = re.sub('[^0-9a-zA-Z]+', '', tok) == tok
    return is_latin and non_stop

def clean_stopwords(sentence):
    tokens = tokenizer.tokenize(sentence)
    return ' '.join([t for t in tokens if valid_token(t)])
        
def stem(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = [stemmer.stem(t) for t in tokens]
    return ' '.join([t for t in tokens if valid_token(t)])

def empty_to_nan(sentence):
    if len(sentence) < 1:
        return np.nan
    else:
        return sentence

def clean_all(s):
    #s = s.lower()
    s = remove_links_and_html(s)
    s = remove_punct(s)
    s = remove_mentions(s)
    s = clean_stopwords(s)
    # stemming is slow on loads of data, consider uncommenting on big sets.
    #s = stem(s)
    # finally, make sure we have no empty texts
    s = empty_to_nan(s)
    return s


In [44]:
import time

start = time.time()

data['text'] = data['text'].str.lower()
data['text'] = data['text'].apply(clean_all)
# run time: around 3-4 minutes per 1 million texts
end = time.time()
print(end - start)

# prune empty texts
data = data.dropna()
data.head(30)

25.808963775634766


Unnamed: 0,birthyear,fame,gender,occupation,text
1749932,1994,star,female,sports,cheeky sizzles bop sugar
1241123,1992,star,male,sports,j alvarez haters remix ft bad bunny almighty
5176064,1948,rising,male,science,nyt reporter broke comey story also broke stor...
3174102,1987,superstar,male,sports,congrats college graduation
1307889,1952,star,male,performer,#gps ft incluido en disponible en todas las pl...
1497675,1965,star,male,performer,produced frank dukes
4540824,1986,star,male,sports,ah got matching purse
4770641,1973,superstar,male,performer,yeah u
1811103,1993,star,male,sports,new years resolutions
2403381,1974,star,male,professional,#julie2 trailer trending youtube wait see perf...


In [None]:
data.to_csv('PREPROCESSED.csv')

In [None]:
data = pd.read_csv('PREPROCESSED.csv')

In [45]:
data = data.dropna()

### 2.4. Train-Test Split

In [46]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, random_state=42, test_size=0.20, shuffle=True)

#train.to_csv('formatted/train.csv')
print(train.shape)
print(test.shape)

(372060, 5)
(93015, 5)


In [47]:
train_text = train['text']
test_text = test['text']

### 2.5. TF-IDF

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
#x = v.fit_transform(df['Review'].values.astype('U'))  ## Even astype(str) would work
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [49]:
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['text', 'birthyear'], axis=1)
print(y_train.shape)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['text', 'birthyear'], axis=1)
print(y_test.shape)

(372060, 3)
(93015, 3)


In [50]:
from sklearn.preprocessing import LabelEncoder
y_train = y_train.apply(LabelEncoder().fit_transform)
print(y_train.head())
y_test = y_test.apply(LabelEncoder().fit_transform)


         fame  gender  occupation
1689826     0       1           3
2339612     0       1           7
1609231     1       0           2
521878      1       0           7
5114930     1       1           7


In [38]:

# reverse
from collections import defaultdict
df = y_train
d = defaultdict(LabelEncoder)
# Encoding the variable
fit = df.apply(lambda x: d[x.name].fit_transform(x))

# Inverse the encoded
fit.apply(lambda x: d[x.name].inverse_transform(x))

# Using the dictionary to label future data
df = df.apply(lambda x: d[x.name].transform(x))
print(df.head())
df = y_train.apply(LabelEncoder().fit_transform)
df.head()

         fame  gender  occupation
2011005     1       0           2
3416909     1       1           2
649605      1       0           0
1052140     1       0           2
1884184     1       1           7


Unnamed: 0,fame,gender,occupation
2011005,1,0,2
3416909,1,1,2
649605,1,0,0
1052140,1,0,2
1884184,1,1,7


## 3. Multi-Label Classification

### 3.1. Multiple Binary Classifications - (One Vs Rest Classifier)

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [52]:
categories = list(data.columns.values)
categories = categories[1:-1]
print(categories)

# num occps
for categ in categories:
    print('vals in ', categ)
    print(y_train[categ].unique())


['fame', 'gender', 'occupation']
vals in  fame
[0 1 2]
vals in  gender
[1 0]
vals in  occupation
[3 7 2 0 6 1 4 5]


In [53]:
%%time
# Using pipeline for applying logistic regression and one vs rest classifier
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

for category in categories:
    printmd('**Processing {} comments...**'.format(category))
    
    # Training logistic regression model on train data
    LogReg_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = LogReg_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

**Processing fame comments...**

Test accuracy is 0.7505456109229695




**Processing gender comments...**

Test accuracy is 0.731226146320486




**Processing occupation comments...**

Test accuracy is 0.46734397677793904


CPU times: user 6.67 s, sys: 505 ms, total: 7.18 s
Wall time: 1min 6s


### 3.2. Multiple Binary Classifications - (Binary Relevance)

In [None]:
%%time

# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

### 3.3. Classifier Chains

In [54]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression

In [None]:
%%time

# initialize classifier chains multi-label classifier
classifier = ClassifierChain(LogisticRegression())

# Training logistic regression model on train datano
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

### 3.4. Label Powerset

In [None]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset

In [None]:
%%time

# initialize label powerset multi-label classifier
classifier = LabelPowerset(LogisticRegression())

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

# accuracy
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")