In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

from features import category_json, category_feature_columns

# Load the competition training CSV for each product category (mobile, fashion, beauty).
# Paths are relative to the notebook's working directory; files are read as UTF-8.
mobile_data = pd.read_csv('data/mobile_data_info_train_competition.csv', encoding='utf8')
fashion_data = pd.read_csv('data/fashion_data_info_train_competition.csv', encoding='utf8')
beauty_data = pd.read_csv('data/beauty_data_info_train_competition.csv', encoding='utf8')

In [2]:
def getFeatures(data):
    """Return the names of the feature columns of ``data`` as a list.

    Every column other than 'itemid', 'title' and 'image_path' counts as a
    feature; the original column order is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that must contain the 'itemid', 'title' and 'image_path' columns.

    Returns
    -------
    list
        Labels of the remaining columns.

    Raises
    ------
    KeyError
        If any of the three non-feature columns is missing from ``data``.
    """
    non_feature_columns = ['itemid', 'title', 'image_path']
    remaining = data.drop(columns=non_feature_columns)
    return list(remaining.columns)


def df_class_to_text(df, category):
    """Return a copy of ``df`` whose encoded feature columns hold text labels.

    For each column listed in ``category_feature_columns[category]`` the
    mapping stored in ``category_json[category][column]`` is inverted
    (value -> key) and applied to that column with ``Series.map``. Entries
    that have no match in the inverted mapping (including missing values)
    become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the encoded feature columns. It is left unmodified.
    category : str
        Key into ``category_json`` and ``category_feature_columns``
        (e.g. "mobile").

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` where the feature columns
        have been replaced by their mapped values.

    Raises
    ------
    KeyError
        If ``category`` or one of its feature columns is unknown to the
        mapping tables, or a feature column is absent from ``df``.
    """
    map_json = category_json[category]

    # Work on a copy so the caller's frame keeps its original codes. Mutating
    # it in place made the conversion non-repeatable: a second call would map
    # the already-converted text through the inverted dict and yield all NaN.
    df_text = df.copy()
    for column in category_feature_columns[category]:
        # The JSON maps key -> value; invert it to look up keys by value.
        inverse_map = {value: key for key, value in map_json[column].items()}
        df_text[column] = df_text[column].map(inverse_map)

    return df_text


def dataPreprocessing(data, category):
    """Build the (titles, labels) training arrays for the first feature column.

    The feature columns of ``data`` are converted to text labels with
    ``df_class_to_text``; then the 'title' column is paired with the first
    feature column reported by ``getFeatures`` and rows with a missing value
    in either column are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame for one product category.
    category : str
        Category key forwarded to ``df_class_to_text`` (e.g. "mobile").

    Returns
    -------
    list
        A one-element list ``[(X, Y)]`` where ``X`` is the array of titles and
        ``Y`` the array of text labels for the first feature column.
    """
    data_text = df_class_to_text(data, category)
    features = getFeatures(data)
    train_data = []

    # Only the first feature column is handled for now.
    data_single_feature = data_text[['title', features[0]]]
    test_df = data_single_feature.dropna()  # drop rows missing the title or the label

    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
    # .values is its documented replacement and returns the same 2-D array.
    numpy_array = test_df.values
    print(numpy_array)  # debug output, kept so the cell prints what it did before
    X = numpy_array[:, 0]  # titles
    Y = numpy_array[:, 1]  # text labels of features[0]
    print(X, Y)
    train_data.append((X, Y))

    return train_data

In [3]:
# Prepare the training data for the mobile category; the function also prints
# the intermediate arrays it builds.
train_data = dataPreprocessing(mobile_data, "mobile")

[['apple iphone 4s back glass spare part original replacement putih' 'ios']
 ['iphone 4s 64gb white' 'ios']
 ['samsung sm b310e piton dual sim' 'ios']
 ..., 
 ['xiaomi mia1 ram 4gb 64gb black' 'android']
 ['khusus hari ini samsung j2 prime' 'android']
 ['oppo a83 2 gb new garansi resmi 1 tahun' 'android']]
['apple iphone 4s back glass spare part original replacement putih'
 'iphone 4s 64gb white' 'samsung sm b310e piton dual sim' ...,
 'xiaomi mia1 ram 4gb 64gb black' 'khusus hari ini samsung j2 prime'
 'oppo a83 2 gb new garansi resmi 1 tahun'] ['ios' 'ios' 'ios' ..., 'android' 'android' 'android']


In [4]:
# Display the prepared training data: a one-element list holding a
# (titles, labels) tuple of object arrays.
train_data

[(array(['apple iphone 4s back glass spare part original replacement putih',
         'iphone 4s 64gb white', 'samsung sm b310e piton dual sim', ...,
         'xiaomi mia1 ram 4gb 64gb black',
         'khusus hari ini samsung j2 prime',
         'oppo a83 2 gb new garansi resmi 1 tahun'], dtype=object),
  array(['ios', 'ios', 'ios', ..., 'android', 'android', 'android'], dtype=object))]

### Training

In [5]:
def train(train_data):
    """Fit a text-classification pipeline on the first (X, Y) pair of ``train_data``.

    The pipeline chains a ``CountVectorizer`` (English stop words removed), a
    ``TfidfTransformer`` and an ``SGDClassifier`` with hinge loss and L2
    penalty (a linear SVM trained by SGD).

    Parameters
    ----------
    train_data : sequence
        Sequence whose first element is an ``(X, Y)`` pair: ``X`` the titles,
        ``Y`` the matching labels. Any further elements are ignored.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline.
    """
    text_clf = Pipeline([
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        # `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21.
        # max_iter=5 with tol=None disables early stopping, so training still
        # runs exactly five epochs as n_iter=5 did.
        ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                  max_iter=5, tol=None, random_state=42)),
    ])

    X = train_data[0][0]
    Y = train_data[0][1]

    # Pipeline.fit returns the pipeline itself.
    return text_clf.fit(X, Y)

In [6]:
# Train a text classifier on the prepared training data.
# NOTE(review): despite its name, text_clf_list holds the single fitted
# pipeline returned by train(), not a list of classifiers.
text_clf_list = train(train_data)



### Testing

In [7]:
def test(text_clf, val_data_csv):
    val_data = pd.read_csv(val_data_csv, encoding='utf8')
    X_test = val_data['title']

    predicted = text_clf.predict(X_test)
    return predicted

In [8]:
# Run the trained classifier over the titles of the mobile validation set.
val_data_csv = 'data/mobile_data_info_val_competition.csv'
predicted = test(text_clf_list, val_data_csv)

In [9]:
# Show the predicted labels for the validation titles.
# NOTE(review): the saved output shows only 'ios' at both ends of the array;
# check the distribution of predicted labels before relying on this model.
print(predicted)

['ios' 'ios' 'ios' ..., 'ios' 'ios' 'ios']
