# NLP: Text Classification using scikit-learn, python and NLTK.
Sources:  
- https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a  
- https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html  

**Method:** Naive Bayes classifier for multinomial models  
Applied to data from NDSC (https://www.kaggle.com/c/ndsc-advanced)

### Reading the data

In [2]:
import pandas as pd
import numpy as np

# Read the data
mobile_data = pd.read_csv('data/mobile_data_info_train_competition.csv', encoding='utf8')
fashion_data = pd.read_csv('data/fashion_data_info_train_competition.csv', encoding='utf8')
beauty_data = pd.read_csv('data/beauty_data_info_train_competition.csv', encoding='utf8')

In [5]:
def getFeatures(data):
    features = data.drop(columns=['itemid', 'title', 'image_path'])
    return list(features)

print("Features in mobile data")
print(getFeatures(mobile_data))

print("\nFeatures in fashion data")
print(getFeatures(fashion_data))

print("\nFeatures in beauty data")
print(getFeatures(beauty_data))

Features in mobile data
['Operating System', 'Features', 'Network Connections', 'Memory RAM', 'Brand', 'Warranty Period', 'Storage Capacity', 'Color Family', 'Phone Model', 'Camera', 'Phone Screen Size']

Features in fashion data
['Pattern', 'Collar Type', 'Sleeves', 'Fashion Trend', 'Clothing Material']

Features in beauty data
['Colour_group', 'Brand', 'Benefits', 'Product_texture', 'Skin_type']


### Data Preprocessing
1st iteration: Dropping all NaNs. Future iterations to improve data quality or to try other techniques

In [6]:
def dataPreprocessing(data):
    features = getFeatures(data)
    train_data = []

    for feature in features:
        data_single_feature = data[['title', feature]]
        test_df = data_single_feature.dropna()  # 1st iteration, drop all NaNs
        numpy_array = test_df.as_matrix()
        X = numpy_array[:, 0]  # words
        Y = numpy_array[:, 1]  # value of OS
        Y = Y.astype(int)  # need to cast to int for later use
        train_data.append((X, Y))

    return train_data

In [13]:
train_data = dataPreprocessing(mobile_data)

### Train model and test

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

def train(train_data):
    text_clf_list = []
    text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB()),
                         ])

    for data in train_data:
        X = data[0]
        Y = data[1]
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.4, random_state=42)
        text_clf = text_clf.fit(X_train, Y_train)
        text_clf_list.append(text_clf)

    return text_clf_list

In [15]:
# Train a text classifier
text_clf_list = train(train_data)

### Test with mobile_data_info_val_competition.csv

In [17]:
def test(text_clf_list, val_data_csv):
    val_data = pd.read_csv(val_data_csv, encoding='utf8')
    X_test = val_data['title']
    predicted_list = []

    for text_clf in text_clf_list:
        predicted = text_clf.predict(X_test)
        predicted_list.append(predicted)

    return predicted_list

In [18]:
# Test the model
val_data_csv = 'data/mobile_data_info_val_competition.csv'
predicted_list = test(text_clf_list, val_data_csv)
print(predicted_list) # Note: This is only top-1 prediction. 

[array([2, 5, 2, ..., 5, 5, 5]), array([2, 5, 2, ..., 5, 5, 5]), array([2, 5, 2, ..., 5, 5, 5]), array([2, 5, 2, ..., 5, 5, 5]), array([2, 5, 2, ..., 5, 5, 5]), array([2, 5, 2, ..., 5, 5, 5]), array([2, 5, 2, ..., 5, 5, 5]), array([2, 5, 2, ..., 5, 5, 5]), array([2, 5, 2, ..., 5, 5, 5]), array([2, 5, 2, ..., 5, 5, 5]), array([2, 5, 2, ..., 5, 5, 5])]


**Note:** This is currently only top-1 prediction. Top-2 prediction is needed for a valid submission format.

**TODO:** Get top-2 prediction  
**TODO:** Create submission format