## Datacamp CapGemini
#### Group 7

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### Import data

In [7]:
labeled_data = pd.read_csv('../data/labeled_data.csv', encoding='utf8')
unlabeled_data = pd.read_csv('../data/data_unlabeled.csv', engine='c', encoding='utf8')

In [8]:
labeled_data["text"].sample(20)

1067                                    plenty of memory.
2537    you could get a gaming laptop that has gtx 105...
438     the s8 is great, easy to use and tons of features
7735    combined with the image stabilizers for the ca...
2982    galaxy s8 and s8+ oreo beta may end january 15...
8256    no improvement  in camera pixel 12 mp is less,...
2492    i really appreciated this streamlined process ...
2499    the fingerprint scanner doesn't bother me i wh...
5620    the x is the start of a new line where they te...
4809       i love that they added back expandable memory.
6956    the samsung galaxy line remained top in the li...
1415    shit dude, you might also need a vomit bucket ...
4705    if you are looking for a new mobile device, yo...
4664    its smaller than the 7s, this make it easy to ...
7056    had to get a new phone since i was running out...
8745                           the screen is really nice.
5082    enjoying the new features and cool gadgets tha...
787           

In [9]:
unlabeled_data["text"].sample(10)

29729                      face unlock is not that great .
59483    the no home button addition was quite annoying...
57891                     amazing specs and image quality!
75434    i have own previous i phones, and when i up gr...
10906    only happens on first attempt, then works as e...
55628    stereo speakers in the phone is the most stupi...
23448                  my old s6 charges faster than this.
7646     they do need to patch the software to allow us...
70640    well, compared tomy previous phone (galaxy not...
32663     \r\n\r\n\r\nand dont compare the devices prec...
Name: text, dtype: object

In [13]:
selected_classes = ['screen', 'software_bugs', 'locking_system', 'system', 'apps_update', 'battery_life_charging', 'customerservice']
labeled_data.iloc[:, 1:].loc[:, selected_classes].apply(pd.value_counts)

Unnamed: 0,screen,software_bugs,locking_system,system,apps_update,battery_life_charging,customerservice
0.0,10681,10857,10791,10654,10915,10850,10888
1.0,316,140,206,343,82,147,109


## Feature creation

We start by defining our preprocessing pipeline : 

### Preprocessing pipeline

#### Pre-processing and tokenization

In [8]:
import regex
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords as stpwrds

In [9]:
def preprocess_text(df):
    # Remove ponctuation
    matrix = str.maketrans(",\"_;", "    ", "'’.()/-?!|:><&[]*=@%^â€™")
    df["text"] = df["text"].transform(lambda x: x.translate(matrix))
    
    # Remove bad characters
    df["text"] = df["text"].transform(lambda text: ''.join([x for x in text if ord(x)<128 or ord(x)!=25]))
    
    # Remove hashtags
    df["text"] = df["text"].map(lambda x: regex.sub('#[a-zA-Z0-9-]*', '', x))
    
    # Remove number only strings
    numbers = regex.compile('^[0-9 ]+$')
    mask = df["text"].map(lambda x: not numbers.match(x))
    df = df[mask]
    
    mask = df["text"].map(lambda x: x.strip() == '')
    df = df.loc[~mask]
    
    return df

In [10]:
def tokenize(df):
    # Tokenize
    tweet = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    stopwords = set(stpwrds.words('english'))

    df["text"] = df["text"].transform(tweet.tokenize)
    
    # Remove stopwords
    df["text"] = df["text"].transform(lambda x: [token for token in x if not token in stopwords])
    
    # Remove tokens with only numbers
    numbers = regex.compile('^[0-9]{3,}$')
    df["text"] = df["text"].map(lambda x: [token for token in x if not numbers.match(token)])
    
    # Number of tokens
    df["length"] = df["text"].apply(lambda x: len(x))
    return df

In [11]:
# Detect language
import langdetect

def detect_lang(x):
    try: 
        return langdetect.detect(x)
    except Exception as e:
        return None

#### Lemmatization

In [341]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [342]:
def lemmatize(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

In [12]:
import spacy
nlp = spacy.load('en', disable=['parser', 'ner'])

In [13]:
def spacy_lemmatize(tokens, nlp):
    doc = nlp(" ".join(tokens))
    return [token.lemma_.lower().strip() for token in doc if token.lemma_ != "-PRON-"]

#### Custom lemmatization

In [14]:
# Custom tokens
def custom_lemmatize(tokens):
    processed = []
    extend = processed.extend
    length = len(tokens)
    
    for i, token in enumerate(tokens):
        # iPhones
        if token == "x" or token == "10":
            result = ["10"]
            if i>0 and tokens[i-1] != "iphone":
                result.insert(0, "iphone")
            extend(result)
            continue
        if token in ["6", "7", "8"]:
            result = [token]
            if i>0 and tokens[i-1] != "iphone":
                result.insert(0, "iphone")
            extend(result)
            continue
        if token == "+":
            extend(["plus"])
        extend([token])
        
        # Samsung
        if token == "s8":
            result = ["S8"]
            if i>0 and tokens[i-1] != "samsung":
                result.insert(0, "samsung")
            extend(result)
            continue
        
    return processed

#### Bigrams

In [15]:
from gensim.models.phrases import Phrases, Phraser

In [16]:
def bigrams(column):
    phrases = Phrases(column.values.tolist())
    bigrams = Phraser(phrases)
    return list(bigrams[column.values.tolist()])

#### Final function

In [17]:
def preprocessing_pipeline(df, threshold=4):
    df = preprocess_text(df)
    df = tokenize(df)
    df['text'] = df['text'].progress_map(lambda x: spacy_lemmatize(x, nlp))
    df['text'] = df['text'].apply(custom_lemmatize)
    df['text'] = bigrams(df['text'])
    df['text'] = df['text'].apply(lambda x: [tk for tk in x if tk])
    df = df[df['text'].map(lambda x: len(x) > threshold)]
    return df

### Word/sentence encoding

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_lbl_train, X_lbl_test = train_test_split(labeled_data, test_size=0.2, random_state=42, stratify=labeled_data['issue'])

In [22]:
counts = X_lbl_train.iloc[:, 1:].loc[:, selected_classes].apply(pd.value_counts)
for cat, values in counts.iteritems():
    print("{} - {:.2%} positives".format(cat, values[1]/values[0]))

screen - 3.08% positives
software_bugs - 1.31% positives
locking_system - 1.79% positives
system - 3.14% positives
apps_update - 0.80% positives
battery_life_charging - 1.39% positives
customerservice - 1.08% positives


In [20]:
X_lbl_train = preprocessing_pipeline(X_lbl_train)
X_lbl_test = preprocessing_pipeline(X_lbl_test)
y_lbl_train = X_lbl_train['issue']
y_lbl_test = X_lbl_test['issue']

100%|██████████| 8788/8788 [00:24<00:00, 352.03it/s]
100%|██████████| 2197/2197 [00:06<00:00, 362.03it/s]


#### Doc2Vec (what we use)

In [21]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [22]:
def get_docs(sources):
    for label, source in sources.items():
        for i, tokens in source.iteritems():
            yield TaggedDocument(words=tokens, tags=["{}_{}".format(label, i)])

In [23]:
# Train with train data
sources = {
    'TRAIN': X_lbl_train["text"]
}

reviews = list(get_docs(sources))

In [24]:
model = Doc2Vec(size=500, window=15, min_count=10, workers=10)
model.build_vocab(reviews)
model.train(reviews, epochs=20, total_examples=model.corpus_count)

In [25]:
def d2v(model, data, infer=True): 
    if infer:
        vectors = [model.infer_vector(review) for review in data.values.tolist()]
    else:
        vectors = [model["TRAIN_{}".format(i)] for i in data.index.values.tolist()]
        
    return pd.DataFrame(vectors, index=data.index, columns=["dim_{}".format(i) for i in range(model.vector_size)])

In [26]:
X_lbl_train_d2v = d2v(model, X_lbl_train['text'], infer=False)
X_lbl_test_d2v = d2v(model, X_lbl_test['text'], infer=True)

#### TF-IDF

In [28]:
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# Get vocabulary
vocab = list(
    set(itertools.chain(*X_lbl_train['text'].tolist()))|
    set(itertools.chain(*X_lbl_test['text'].tolist()))|
    set(itertools.chain(*X_unlbl['text'].tolist()))
)
vocab_dict = dict((y, x) for x, y in enumerate(vocab))

In [30]:
model = TfidfVectorizer(ngram_range=(1,3), use_idf=True, vocabulary=vocab_dict)

In [31]:
X_lbl_train = model.fit_transform(X_lbl_train['text'].apply(lambda x: " ".join(x)).values.tolist())
X_lbl_test = model.transform(X_lbl_test['text'].apply(lambda x: " ".join(x)).values.tolist())
X_unlbl = model.transform(X_unlbl['text'].apply(lambda x: " ".join(x)).values.tolist())

### Semi-supervised learning

#### XGBoost classification without label propagation

In [27]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [28]:
clf = XGBClassifier(max_depth=20, 
                    n_estimators=750, 
                    min_child_weight=3, 
                    scale_pos_weight=2,
                    learning_rate=0.05, 
                    max_delta_step=0.5,
                    gamma=0.01,
                    colsample_bytree=0.8,
                    n_jobs=-1)

In [29]:
clf.fit(X_lbl_train_d2v, y_lbl_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.01, learning_rate=0.05,
       max_delta_step=0.5, max_depth=20, min_child_weight=3, missing=None,
       n_estimators=750, n_jobs=-1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=2, seed=None, silent=True,
       subsample=1)

In [30]:
print(classification_report(y_lbl_test, clf.predict(X_lbl_test_d2v)))

             precision    recall  f1-score   support

        0.0       0.86      0.90      0.88      1290
        1.0       0.32      0.25      0.28       253

avg / total       0.77      0.79      0.78      1543



  if diff:


#### Prediction on unlabeled data (WIP - not used for the final prediction due to poor results)

First we use our classifier to predict issues on the unlabeled data : 

In [45]:
X_unlbl = preprocessing_pipeline(unlabeled_data)

100%|██████████| 84330/84330 [03:52<00:00, 363.36it/s]


In [46]:
X_unlbl_d2v = d2v(model, X_unlbl['text'], infer=True)

In [None]:
issue_predictions = pd.DataFrame(clf.predict_proba(X_unlbl_d2v)[:, 1], index=X_unlbl_d2v.index)

unlabeled_data['xgb_issue_prob'] = issue_predictions
# Drop rows if no issue proba
unlabeled_data = unlabeled_data.dropna(subset=['xgb_issue_prob'])
# Predict binary output according to a threshold
unlabeled_data['xgb_issue'] = unlabeled_data['xgb_issue_prob'].apply(lambda x: x > 0.4).map(int)

In [None]:
unlabeled_data['xgb_issue'].value_counts()

Then we perform sentiment analysis in order to detect false positives :

In [None]:
from textblob import TextBlob

In [None]:
def sentiment(text):
    try:
        return TextBlob(text).sentiment.polarity
    except:
        return None

In [None]:
unlabeled_data['sentiment'] = unlabeled_data["text"].progress_map(sentiment)

We cross results to spot weird matches :

In [None]:
def combine_preds(row, prob_col_name="xgb_issue", sentiment_col_name="sentiment"):
    return int(row[prob_col_name] == 1 and row[sentiment_col_name] <= 0)

unlabeled_data['issue'] = unlabeled_data.apply(combine_preds, axis=1)

In [None]:
unlabeled_data['issue'].value_counts()

#### Retrain

In [None]:
clf_all = XGBClassifier(max_depth=8, 
                        n_estimators=750, 
                        min_child_weight=3, 
                        scale_pos_weight=2,
                        learning_rate=0.05, 
                        max_delta_step=0.5,
                        gamma=0.01,
                        n_jobs=-1)

In [None]:
X_all = pd.concat([X_lbl_train_d2v, X_unlbl_d2v])
y_all = pd.concat([y_lbl_train, unlabeled_data['issue']])

In [None]:
clf.fit(X_all, y_all)

In [None]:
print(classification_report(y_lbl_test, clf.predict(X_lbl_test_d2v)))

### Hackathon : multi-~~label~~ class problem

We decided to reduce the classification problem to a multiclass one :

In [31]:
selected_classes = ['screen', 'software_bugs', 'locking_system', 'system', 'apps_update', 'battery_life_charging', 'customerservice']

We create a new `issue` indicator column on the class subset and merge all classes dummy variables :

In [32]:
X_lbl_train['issue'] = X_lbl_train.loc[:, selected_classes].apply(lambda x: int(x.any()), axis=1)
X_lbl_test['issue'] = X_lbl_test.loc[:, selected_classes].apply(lambda x: int(x.any()), axis=1)

In [33]:
def encode_class(row):
    if row['issue'] == 1:
        issue_name = row.idxmax(1)
        return selected_classes.index(issue_name)+1
    return 0

In [35]:
y_lbl_train = X_lbl_train.loc[:, selected_classes + ['issue']].apply(encode_class, axis=1)
y_lbl_test = X_lbl_test.loc[:, selected_classes + ['issue']].apply(encode_class, axis=1)

We train the classifier on this categorical vector :

In [36]:
clf = XGBClassifier(max_depth=8, 
                    n_estimators=750, 
                    objective='multi:softprob',
                    min_child_weight=3, 
                    scale_pos_weight=2,
                    learning_rate=0.05, 
                    max_delta_step=0.5,
                    gamma=0.01,
                    n_jobs=-1)

In [37]:
clf.fit(X_lbl_train_d2v, y_lbl_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.01, learning_rate=0.05,
       max_delta_step=0.5, max_depth=8, min_child_weight=3, missing=None,
       n_estimators=750, n_jobs=-1, nthread=None,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=2, seed=None, silent=True,
       subsample=1)

In [38]:
print(classification_report(y_lbl_test, clf.predict(X_lbl_test_d2v)))

             precision    recall  f1-score   support

          0       0.88      0.96      0.92      1348
          1       0.25      0.29      0.27        48
          2       1.00      0.04      0.08        23
          3       0.00      0.00      0.00        39
          4       0.00      0.00      0.00        54
          5       0.00      0.00      0.00         6
          6       0.27      0.19      0.22        16
          7       0.00      0.00      0.00         9

avg / total       0.80      0.85      0.82      1543



  if diff:


### Prediction on test data

In [39]:
test_data = pd.read_csv('../data/test_data.csv', encoding='utf8')

In [40]:
X_test = preprocessing_pipeline(test_data)
X_test_d2v = d2v(model, X_test['text'], infer=True)

100%|██████████| 2513/2513 [00:06<00:00, 365.16it/s]


In [41]:
predictions = pd.DataFrame(clf.predict_proba(X_test_d2v),
                           index=X_test.index)
predictions = predictions.drop(columns=[0])
predictions.columns = selected_classes

# We set a low threshold in order to maximize the recall
predictions = predictions.applymap(lambda x: int(x>0.2))

In [42]:
X_y_test = pd.concat([test_data['text'], predictions], axis=1)
X_y_test = X_y_test.fillna(0)
X_y_test['issue'] = X_y_test.iloc[:, 1:].apply(lambda x: int(x.any()), axis=1)

In [43]:
# Preview of the results
mask = X_y_test.iloc[:, 2:].apply(lambda x: x.any(), axis=1)
X_y_test[mask]

Unnamed: 0,text,screen,software_bugs,locking_system,system,apps_update,battery_life_charging,customerservice,issue
32,screen slippery hard to hold due to curved edg...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
63,the screen looks so much smaller when you firs...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
69,my friends week old s8 has already found a poi...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
79,i wasnt sure id like the curved edge but ive g...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
120,is lighter and the screen resolution is much b...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
146,end it has everything you want to head and it ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
158,the curved glass has taken me awhile to get us...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
173,less than 2 weeks in and theres a small scratc...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
183,it feels fragile in my hand it probably isnt r...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
202,it streams shows easily to my smart tv,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [44]:
# Save to disk
X_y_test.to_csv('../data/test_final.csv')
X_y_test[mask].to_csv('../data/test_issues_final.csv')

### Prediction on the unlabeled data

In [54]:
predictions = pd.DataFrame(clf.predict_proba(X_unlbl_d2v),
                           index=X_unlbl.index)
predictions = predictions.drop(columns=[0])
predictions.columns = selected_classes

# We set a low threshold in order to maximize the recall
predictions = predictions.applymap(lambda x: int(x>0.2))

In [68]:
X_y_unlbl = pd.concat([unlabeled_data['text'], predictions], axis=1)
X_y_unlbl = X_y_unlbl.fillna(0)
X_y_unlbl['issue'] = X_y_unlbl.iloc[:, 1:].apply(lambda x: int(x.any()), axis=1)

In [69]:
counts = X_y_unlbl.loc[X_y_unlbl['issue'] == 1, :].iloc[:, 1:-1].apply(pd.value_counts)

In [70]:
for col, values in counts.iteritems():
    print('Feature category: {}'.format(col))
    for i, v in values.iteritems():
        print("{} - {}".format("No" if i == 0 else "Yes", v))

Feature category: screen
No - 1336
Yes - 4305
Feature category: software_bugs
No - 5577
Yes - 64
Feature category: locking_system
No - 5056
Yes - 585
Feature category: system
No - 5502
Yes - 139
Feature category: apps_update
No - 5597
Yes - 44
Feature category: battery_life_charging
No - 5041
Yes - 600
Feature category: customerservice
No - 5598
Yes - 43


In [71]:
mask = X_y_unlbl.iloc[:, 1:-1].apply(lambda x: x.any(), axis=1)
X_y_unlbl[mask]

Unnamed: 0,text,screen,software_bugs,locking_system,system,apps_update,battery_life_charging,customerservice,issue
52,take the screen ratio into account and do a go...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
59,i just wish they kept the led notice lights wh...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
61,you should download the google keyboard i kep...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
71,a lightning cable is only useful within the ti...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
109,the phone is all i expected and more out of t...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
117,id rather have the larger screen and resolutio...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
131,going to miss the home button at the bottom of...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
142,designed with the most durable glass ever in a...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
143,designed with the most durable glass ever in a...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1
157,it look a few uses to get use to the way this ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [72]:
X_y_unlbl.to_csv('../data/propagation_unlbl_final.csv')
X_y_unlbl[mask].to_csv('../data/unlbl_issues_final.csv')