# TAMU Datathon - Bloomberg Challenge (post-competition work)

I wanted to revisit this project, and see it to completion.

Part 1 was guessing what 5 embeddings' original news articles were about (see other notebook).

Part 2 was building a general classifier for Bloomberg's embedding system.

This will have two phases:

A: A genre classifier for bodies of text, trained on a large set of articles. This will be used to generate genre labels for the set of ~1,000 embeddings we were originally given at the start of the contest.

B: A genre classifier for Bloomberg's embeddings. This will only be trained using the ~1,000 embeddings we were given as features, and the labels generated in Part A as targets.

# PART 2: Classifier

## Phase A: Genre Classifier, Label Generation for Embedding Set

First, import the training sets (texts with genre labels) and embedding data (embeddings with texts).

Second, preprocess the data using TF-IDF vectors.

Third, train and cross-validate a supervised learning model for label generation.

Fourth, once the model is satisfactory, apply to the embedding data.

Fifth, export the embedding-label pairings (so I don't have to repeat this cell).

Training the label classifier

In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling
import numpy as np
import pandas as pd
import os
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

os.chdir('./BBC_Training_data/')
os.chdir('./business/')

# next, make a NP array of every txt file in the directory
SampleList = np.array(os.listdir())
SampleList = SampleList[ np.char.endswith(SampleList, '.txt') ] #ignore everything that's not a txt file.

#make a fresh blank series.
Business = pd.Series(dtype='str')

for SampleName in SampleList:
    with open(SampleName) as f:
        contents = np.array(f.readlines(), dtype = 'str')
    contents = [line.rstrip('\n') for line in contents]
    #contents.str.rstrip('\n')
    Business[SampleName] = ' '.join(contents)

#print(Business.head())


os.chdir('../')
os.chdir('./entertainment/')
# next, make a NP array of every txt file in the directory
SampleList = np.array(os.listdir())
SampleList = SampleList[ np.char.endswith(SampleList, '.txt') ] #ignore everything that's not a txt file.
#make a fresh blank series.
Entertainment = pd.Series(dtype='str')
for SampleName in SampleList:
    with open(SampleName) as f:
        contents = np.array(f.readlines(), dtype = 'str')
    contents = [line.rstrip('\n') for line in contents]
    #contents.str.rstrip('\n')
    Entertainment[SampleName] = ' '.join(contents)

#print(Entertainment.head())


os.chdir('../')
os.chdir('./politics/')
# next, make a NP array of every txt file in the directory
SampleList = np.array(os.listdir())
SampleList = SampleList[ np.char.endswith(SampleList, '.txt') ] #ignore everything that's not a txt file.
#make a fresh blank series.
Politics = pd.Series(dtype='str')
for SampleName in SampleList:
    with open(SampleName) as f:
        contents = np.array(f.readlines(), dtype = 'str')
    contents = [line.rstrip('\n') for line in contents]
    #contents.str.rstrip('\n')
    Politics[SampleName] = ' '.join(contents)

#print(Politics.head())


os.chdir('../')
os.chdir('./sport/')
# next, make a NP array of every txt file in the directory
SampleList = np.array(os.listdir())
SampleList = SampleList[ np.char.endswith(SampleList, '.txt') ] #ignore everything that's not a txt file.
#make a fresh blank series.
Sports = pd.Series(dtype='str')
for SampleName in SampleList:
    with open(SampleName) as f:
        contents = np.array(f.readlines(), dtype = 'str')
    contents = [line.rstrip('\n') for line in contents]
    #contents.str.rstrip('\n')
    Sports[SampleName] = ' '.join(contents)

#print(Sports.head())


os.chdir('../')
os.chdir('./tech/')
# next, make a NP array of every txt file in the directory
SampleList = np.array(os.listdir())
SampleList = SampleList[ np.char.endswith(SampleList, '.txt') ] #ignore everything that's not a txt file.
#make a fresh blank series.
Technology = pd.Series(dtype='str')
for SampleName in SampleList:
    with open(SampleName) as f:
        contents = np.array(f.readlines(), dtype = 'str')
    contents = [line.rstrip('\n') for line in contents]
    #contents.str.rstrip('\n')
    Technology[SampleName] = ' '.join(contents)

#print(Technology.head())


vec = TfidfVectorizer()
X = vec.fit_transform(pd.concat([Business, Entertainment, Politics, Sports, Technology]))
Features = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

BusLab = np.full((int(len(Business)), 1), 'Business')
EntLab = np.full((int(len(Entertainment)), 1), 'Entertainment')
PolLab = np.full((int(len(Politics)), 1), 'Politics')
SpoLab = np.full((int(len(Sports)), 1), 'Sports')
TecLab = np.full((int(len(Technology)), 1), 'Technology')
Targets = np.concatenate([BusLab, EntLab, PolLab, SpoLab, TecLab])

print(Features.head())


# Next, get the embedding data.

os.chdir('../')
os.chdir('../')
cnn_samples0 = pd.read_csv('cnn_samples-54b19b96f3c0775b116bad527df8c7b5.csv')

# Wrangling the data from strings to NP arrays.
cnn_samples1 = np.fromstring((cnn_samples0.values[0,3]).replace('[','').replace(']',''), sep=',').reshape(1,512)

# Rebuilding the DataFrame after this, with headline as index.
for i in np.arange(1,np.shape(cnn_samples0)[0]):
    temp = np.fromstring((cnn_samples0.values[i,3]).replace('[','').replace(']',''), sep=',').reshape(1,512)
    cnn_samples1 = np.vstack([cnn_samples1, temp])
cnn_samples = pd.DataFrame(cnn_samples1, index = cnn_samples0['text'])

#Repeating the process for the challenge data.
gov_samples0 = pd.read_csv('federal_samples-a586d0681e005629453435bea5b173eb.csv')
gov_samples1 = np.fromstring((gov_samples0.values[0,3]).replace('[','').replace(']',''), sep=',').reshape(1,512)
for i in np.arange(1,np.shape(gov_samples0)[0]):
    temp = np.fromstring((gov_samples0.values[i,3]).replace('[','').replace(']',''), sep=',').reshape(1,512)
    gov_samples1 = np.vstack([gov_samples1, temp])
gov_samples = pd.DataFrame(gov_samples1, index = gov_samples0['text'])

#also need to merge the two DataFrames
cnngov_samples = pd.concat([cnn_samples, gov_samples], axis = 'rows')


# TF-IDF word frequency counter (for articles)
vec2 = TfidfVectorizer()
embeddingText = vec2.fit_transform(cnngov_samples.index.values)
embeddingX = pd.DataFrame(embeddingText.toarray(), columns=vec2.get_feature_names())


# Need to make a common set of word columns for the Features and embeddingX TF-IDF dataframes (i.e. union).

rowOfZeros1 = pd.DataFrame(0, index = ['0'], columns = embeddingX.columns.values)
rowOfZeros2 = pd.DataFrame(0, index = ['0'], columns = Features.columns.values)

Features = pd.concat([Features, rowOfZeros1]).fillna(0)
embeddingX = pd.concat([embeddingX, rowOfZeros2]).fillna(0)

Features = Features.drop(index = '0')
embeddingX = embeddingX.drop(index = '0')

#Let's try k-NN classification.

TextClassifier = KNeighborsClassifier(n_neighbors=7)
TextClassifier.fit(Features, np.ravel(Targets))

#Need to cross-validate this model.

print(cross_val_score(TextClassifier, Features, np.ravel(Targets), cv=5))

# finally, use the KNN model previously constructed to assign (predict) genre labels.

embeddingLabels = TextClassifier.predict(embeddingX)

print(embeddingLabels)
print(type(embeddingLabels))

print(TextClassifier.predict_proba(embeddingX))

cnngov_samples.to_csv('000cnngov_samples.csv')
pd.DataFrame(embeddingLabels, columns = ['Targets']).to_csv('000embeddingLabels.csv')

    00       000  0001  000bn  000m  000s  000th  001  001and  001st  ...  \
0  0.0  0.020868   0.0    0.0   0.0   0.0    0.0  0.0     0.0    0.0  ...   
1  0.0  0.000000   0.0    0.0   0.0   0.0    0.0  0.0     0.0    0.0  ...   
2  0.0  0.000000   0.0    0.0   0.0   0.0    0.0  0.0     0.0    0.0  ...   
3  0.0  0.018989   0.0    0.0   0.0   0.0    0.0  0.0     0.0    0.0  ...   
4  0.0  0.000000   0.0    0.0   0.0   0.0    0.0  0.0     0.0    0.0  ...   

   zooms  zooropa  zornotza  zorro  zubair  zuluaga  zurich  zutons  \
0    0.0      0.0       0.0    0.0     0.0      0.0     0.0     0.0   
1    0.0      0.0       0.0    0.0     0.0      0.0     0.0     0.0   
2    0.0      0.0       0.0    0.0     0.0      0.0     0.0     0.0   
3    0.0      0.0       0.0    0.0     0.0      0.0     0.0     0.0   
4    0.0      0.0       0.0    0.0     0.0      0.0     0.0     0.0   

   zvonareva  zvyagintsev  
0        0.0          0.0  
1        0.0          0.0  
2        0.0          0.0 

[[0.14285714 0.42857143 0.14285714 0.         0.28571429]
 [0.         0.         0.57142857 0.28571429 0.14285714]
 [0.28571429 0.42857143 0.         0.         0.28571429]
 ...
 [0.28571429 0.         0.28571429 0.14285714 0.28571429]
 [0.         0.42857143 0.14285714 0.14285714 0.28571429]
 [0.         0.         0.         0.71428571 0.28571429]]


AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

## Phase B: Genre Classifier for the Embeddings

First, import the CSV file generated by the previous cell.

Second, train and cross-validate a supervised learning model for label generation.

Third, once the model is satisfactory, predict the genres of the five challenge articles to test.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.mixture import GaussianMixture
# Going to try a custom estimator based on Gaussian mixtures, based on promising results
# from a scrapped idea at the end of the notebook I submitted at the competition.

# Crucial note: Gaussian Mixtures is unsupervised, while Bayes is supervised.

class CustomBayesClassifier(BaseEstimator, ClassifierMixin):
    """Parameters
    ----------
    n : int
        Number of clusters total (a single label can have several clusters).
    covariance_type : str
        Controls the degrees of freedom in the shape of each cluster.
        Three common options: 'full' (default), 'diag', or 'spherical'.
    """
    def __init__(self, n, covariance_type = 'full', random_state=0):
        self.n = n
        self.covariance_type = covariance_type
        self.random_state = random_state # for reproducibility.
        
    def fit(self, X, y):
        self.classes_ = np.sort(np.unique(y))
        training_sets = [X[y == yi] for yi in self.classes_]
        self.models_ = [GaussianMixture(n=self.n, covariance_type=self.covariance_type, random_state=self.random_state).fit(Xi)
                        for Xi in training_sets]
        self.logpriors_ = [np.log(Xi.shape[0] / X.shape[0])
                           for Xi in training_sets]
        return self
        
    def predict_proba(self, X):
        logprobs = np.array([model.score_samples(X)
                             for model in self.models_]).T
        result = np.exp(logprobs + self.logpriors_)
        return result / result.sum(1, keepdims=True)
        
    def predict(self, X):
        return self.classes_[np.argmax(self.predict_proba(X), 1)]


Feat = pd.read_csv('000cnngov_samples.csv', header = 0, index_col = 0)
Targ = pd.read_csv('000embeddingLabels.csv', header = 0, index_col = 0)

Model = CustomBayesClassifier(n = 10)
Model.fit(Feat, Targ)

#Will cross-validate once I work out all the bugs.
#print(cross_val_score(TextClassifier, Features, np.ravel(Targets), cv=5))

Predicting the genre of the 5 challenge embedding and the bonus embedding.

In [None]:
#Reading the challenge data.
challenge0 = pd.read_csv('challenge-ddec63cf66ea88f128e3c21e457f393a.csv')
challenge1 = np.fromstring((challenge0.values[0,1]).replace('[','').replace(']',''), sep=',').reshape(1,512)
for i in np.arange(1,np.shape(challenge0)[0]):
    temp = np.fromstring((challenge0.values[i,1]).replace('[','').replace(']',''), sep=',').reshape(1,512)
    challenge1 = np.vstack([challenge1, temp])
challenge = pd.DataFrame(challenge1, index = challenge0['id'])


#Finally, getting the bonus 6th embedding.
with open('mystery.json') as file:
    mystery0 = json.load(file)['embedding']

mystery = pd.DataFrame(np.array(mystery0).reshape(1,512), index = ['mystery'], columns = np.arange(0,512)) #it's a dict

#Need to merge these two DataFrames.
challenge = pd.concat([challenge, mystery], axis = 'rows')



Unnamed: 0,00,000,001,00145,00232,005,00692,007,010,01081,...,zurich,zvjezdan,zvonareva,zwally,zweig,zynga,zzzz,zzzzz,zzzzzz,état
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
