## Datacamp CapGemini
#### Group 7

Aim: classification

In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### Import data

In [None]:
# Load the labeled and the unlabeled datasets from CSV files (UTF-8 encoded).
labeled_data = pd.read_csv('../data/labeled_data.csv', encoding='utf8')
unlabeled_data = pd.read_csv('../data/data_unlabeled.csv', engine='c', encoding='utf8')

In [None]:
# Display 20 random texts from the labeled set.
labeled_data["text"].sample(20)

In [None]:
# Display 10 random texts from the unlabeled set.
unlabeled_data["text"].sample(10)

In [None]:
# Issue categories studied in this notebook.
selected_classes = [
    'screen',
    'software_bugs',
    'locking_system',
    'system',
    'apps_update',
    'battery_life_charging',
    'customerservice',
]
# Value counts of every selected class column (first column excluded).
class_columns = labeled_data.iloc[:, 1:].loc[:, selected_classes]
class_columns.apply(pd.value_counts)

## Feature creation

We start by defining our preprocessing pipeline : 

### Preprocessing pipeline

#### Pre-processing and tokenization

In [None]:
import regex
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords as stpwrds

In [None]:
def preprocess_text(df):
    """Clean the ``text`` column and drop rows that end up without content.

    Steps, in order: punctuation is replaced by a space or deleted, non-ASCII
    characters and the control character with code 25 are removed, hashtags
    are stripped, and rows whose text contains only digits/spaces or only
    whitespace are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column holding strings.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy restricted to the surviving rows (original index
        kept). The frame passed in is left untouched.
    """
    # Work on a copy so the caller's frame is not silently rewritten.
    df = df.copy()

    # Remove punctuation: `,"_;` become spaces, the third argument is deleted.
    matrix = str.maketrans(",\"_;", "    ", "'’.()/-?!|:><&[]*=@%^â€™")
    text = df["text"].map(lambda x: x.translate(matrix))

    # Remove bad characters. Both conditions must hold: the previous `or`
    # made the test always true, so nothing was ever filtered out.
    text = text.map(
        lambda s: ''.join(ch for ch in s if ord(ch) < 128 and ord(ch) != 25)
    )

    # Remove hashtags
    hashtag = regex.compile('#[a-zA-Z0-9-]*')
    text = text.map(lambda x: hashtag.sub('', x))
    df["text"] = text

    # Drop rows made only of digits/spaces, and rows that are blank.
    numbers = regex.compile('^[0-9 ]+$')
    keep = text.map(lambda x: not numbers.match(x) and x.strip() != '')

    # astype(bool) keeps this a boolean mask even when the frame is empty.
    return df[keep.astype(bool)]

In [None]:
def tokenize(df):
    """Tokenize the ``text`` column and add a ``length`` column.

    Each text is split with NLTK's ``TweetTokenizer`` (lower-cased, handles
    stripped, repeated characters shortened); English stopwords and tokens
    made of three or more digits are then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column holding strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` where ``text`` holds token lists and ``length`` the
        number of tokens kept. The frame passed in is not modified.
    """
    # Copy first: the input is usually a filtered slice, and assigning columns
    # on it would be a chained assignment on someone else's data.
    df = df.copy()

    tweet = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    stopwords = set(stpwrds.words('english'))
    # Only all-digit tokens of 3+ characters are dropped; shorter numbers
    # such as "6" or "10" are kept.
    numbers = regex.compile('^[0-9]{3,}$')

    def _clean(text):
        return [
            token
            for token in tweet.tokenize(text)
            if token not in stopwords and not numbers.match(token)
        ]

    df["text"] = df["text"].map(_clean)

    # Number of tokens
    df["length"] = df["text"].map(len)
    return df

In [None]:
# Detect language
import langdetect

def detect_lang(x):
    """Return the language detected by ``langdetect`` for ``x``.

    Any exception raised during detection is swallowed and ``None`` is
    returned instead.
    """
    try:
        language = langdetect.detect(x)
    except Exception:
        language = None
    return language

#### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
# Shared WordNet lemmatizer instance used by lemmatize().
lemmatizer = WordNetLemmatizer()

In [None]:
def lemmatize(tokens):
    """Return the list of WordNet lemmas of ``tokens``, in the same order."""
    return list(map(lemmatizer.lemmatize, tokens))

In [None]:
import spacy
# English spaCy pipeline with the 'parser' and 'ner' components disabled.
nlp = spacy.load('en', disable=['parser', 'ner'])

In [None]:
def spacy_lemmatize(tokens, nlp):
    """Lemmatize ``tokens`` with the spaCy pipeline ``nlp``.

    The tokens are joined with single spaces and run through ``nlp``; each
    lemma is lower-cased and stripped, and the ``-PRON-`` placeholder lemma is
    left out of the result.
    """
    lemmas = []
    for word in nlp(" ".join(tokens)):
        lemma = word.lemma_
        if lemma == "-PRON-":
            continue
        lemmas.append(lemma.lower().strip())
    return lemmas

#### Custom lemmatization

In [None]:
# Custom tokens
def custom_lemmatize(tokens):
    """Normalise phone-model tokens so each model has a single spelling.

    Rules applied to every token:

    * ``"x"`` and ``"10"`` become ``"10"``; ``"6"``, ``"7"`` and ``"8"`` are
      kept as is. In both cases ``"iphone"`` is inserted in front unless the
      previous token already is ``"iphone"``.
    * ``"s8"`` becomes ``"S8"``, preceded by ``"samsung"`` unless the previous
      token already is ``"samsung"``.
    * ``"+"`` becomes ``"plus"``.
    * Any other token is copied unchanged.

    Each input token is rewritten exactly once. Previously ``"+"`` and
    ``"s8"`` were also emitted in their raw form, and the brand was not added
    when the model was the first token.

    Parameters
    ----------
    tokens : list of str

    Returns
    -------
    list of str
        A new list; ``tokens`` is not modified.
    """
    iphone_models = {"x": "10", "10": "10", "6": "6", "7": "7", "8": "8"}
    processed = []

    for i, token in enumerate(tokens):
        # None for the first token, so the brand is always prepended there.
        previous = tokens[i - 1] if i > 0 else None

        if token in iphone_models:
            # iPhones
            if previous != "iphone":
                processed.append("iphone")
            processed.append(iphone_models[token])
        elif token == "s8":
            # Samsung
            # NOTE(review): "S8" is the only upper-case token produced here;
            # kept as in the original output — confirm the casing is wanted.
            if previous != "samsung":
                processed.append("samsung")
            processed.append("S8")
        elif token == "+":
            processed.append("plus")
        else:
            processed.append(token)

    return processed

#### Bigrams

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
def bigrams(column):
    """Fit a gensim phrase model on ``column`` and apply it to the same data.

    ``column`` is a Series of token lists. The result is a plain list with
    one transformed token list per row, in row order.
    """
    sentences = column.values.tolist()
    phraser = Phraser(Phrases(sentences))
    return list(phraser[sentences])

#### Final function

In [None]:
def preprocessing_pipeline(df, threshold=4):
    """Run every preprocessing step on ``df`` and return the rows worth keeping.

    Order of the steps: text cleaning, tokenization, spaCy lemmatization
    (with a progress bar), custom phone-model normalisation, bigram merging
    and removal of empty tokens. Rows are kept only when they have strictly
    more than ``threshold`` tokens left.
    """
    df = tokenize(preprocess_text(df))

    lemmas = df['text'].progress_map(lambda x: spacy_lemmatize(x, nlp))
    lemmas = lemmas.apply(custom_lemmatize)

    # bigrams() returns a plain list, assigned back row by row.
    df['text'] = bigrams(lemmas)
    df['text'] = df['text'].apply(lambda x: [tk for tk in x if tk])

    long_enough = df['text'].map(lambda x: len(x) > threshold)
    return df[long_enough]

### Word/sentence encoding

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the labeled rows, stratified on the `issue` column, with a
# fixed seed so the split is reproducible.
X_lbl_train, X_lbl_test = train_test_split(labeled_data, test_size=0.2, random_state=42, stratify=labeled_data['issue'])

In [None]:
# Share of positive rows for every selected class in the training split.
counts = X_lbl_train.iloc[:, 1:].loc[:, selected_classes].apply(pd.value_counts)
# `items()` replaces `iteritems()`, which was removed in pandas 2.0.
for cat, values in counts.items():
    # A value missing for a class shows up as NaN after value_counts.
    values = values.fillna(0)
    # Positives over all rows: dividing by the negatives only would print the
    # positive/negative ratio, not a percentage of positives.
    print("{} - {:.2%} positives".format(cat, values.get(1, 0) / values.sum()))

In [None]:
# Run the preprocessing pipeline on each split separately.
# NOTE(review): bigrams() is fitted inside the pipeline, so the train and test
# splits each learn their own phrase model — confirm this is intended.
X_lbl_train = preprocessing_pipeline(X_lbl_train)
X_lbl_test = preprocessing_pipeline(X_lbl_test)
# Targets are read after preprocessing so they match the rows that were kept.
y_lbl_train = X_lbl_train['issue']
y_lbl_test = X_lbl_test['issue']

#### Doc2Vec (what we use)

In [None]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [None]:
def get_docs(sources):
    """Yield one ``TaggedDocument`` per row of every source.

    Parameters
    ----------
    sources : dict
        Maps a label to a pandas Series of token lists.

    Yields
    ------
    TaggedDocument
        ``words`` is the token list and ``tags`` a one-element list holding
        ``"<label>_<index>"``, where ``<index>`` is the row's index label.
    """
    for label, source in sources.items():
        # `Series.items()` replaces `iteritems()`, removed in pandas 2.0.
        for i, tokens in source.items():
            yield TaggedDocument(words=tokens, tags=["{}_{}".format(label, i)])

In [None]:
# Train with train data only; the 'TRAIN' label becomes the prefix of every
# document tag built by get_docs().
sources = {
    'TRAIN': X_lbl_train["text"]
}

# Materialise the generator so the corpus can be iterated more than once.
reviews = list(get_docs(sources))

In [None]:
# Doc2Vec model: build the vocabulary from the tagged reviews, then train for
# 20 epochs.
# NOTE(review): `size=` is the keyword used by older gensim releases; newer
# ones appear to expect `vector_size=` — confirm against the installed version.
model = Doc2Vec(size=500, window=15, min_count=10, workers=10)
model.build_vocab(reviews)
model.train(reviews, epochs=20, total_examples=model.corpus_count)

In [None]:
def d2v(model, data, infer=True):
    """Return the document vectors of ``data`` as a DataFrame.

    Parameters
    ----------
    model : trained Doc2Vec model
    data : pandas.Series of token lists
    infer : bool
        When true, a vector is inferred for every token list. Otherwise the
        vector stored in the model under the tag ``"TRAIN_<index>"`` is
        looked up for every index label of ``data``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``data``, with one ``dim_<i>`` column per dimension.
    """
    if infer:
        vectors = list(map(model.infer_vector, data.values.tolist()))
    else:
        vectors = [model["TRAIN_{}".format(idx)] for idx in data.index.values.tolist()]

    column_names = ["dim_{}".format(i) for i in range(model.vector_size)]
    return pd.DataFrame(vectors, index=data.index, columns=column_names)

In [None]:
# Training rows reuse the vectors stored in the model (looked up by tag);
# test rows get inferred vectors.
X_lbl_train_d2v = d2v(model, X_lbl_train['text'], infer=False)
X_lbl_test_d2v = d2v(model, X_lbl_test['text'], infer=True)

#### TF-IDF

In [None]:
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Get vocabulary: every distinct token of the train, test and unlabeled sets.
# NOTE(review): X_unlbl is only assigned in the "Prediction on unlabeled data"
# section further down, so that cell has to run before this one.
def _token_set(frame):
    return set(itertools.chain.from_iterable(frame['text'].tolist()))

vocab = list(_token_set(X_lbl_train) | _token_set(X_lbl_test) | _token_set(X_unlbl))
# Token -> column position, as expected by TfidfVectorizer(vocabulary=...).
vocab_dict = {token: position for position, token in enumerate(vocab)}

In [None]:
model = TfidfVectorizer(ngram_range=(1,3), use_idf=True, vocabulary=vocab_dict)

In [None]:
X_lbl_train = model.fit_transform(X_lbl_train['text'].apply(lambda x: " ".join(x)).values.tolist())
X_lbl_test = model.transform(X_lbl_test['text'].apply(lambda x: " ".join(x)).values.tolist())
X_unlbl = model.transform(X_unlbl['text'].apply(lambda x: " ".join(x)).values.tolist())

### Semi-supervised learning

#### XGBoost classification without label propagation

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [None]:
# XGBoost classifier used for the binary `issue` target.
clf = XGBClassifier(max_depth=20, 
                    n_estimators=750, 
                    min_child_weight=3, 
                    scale_pos_weight=2,
                    learning_rate=0.05, 
                    max_delta_step=0.5,
                    gamma=0.01,
                    colsample_bytree=0.8,
                    n_jobs=-1)

In [None]:
# Fit on the Doc2Vec vectors of the training split.
clf.fit(X_lbl_train_d2v, y_lbl_train)

In [None]:
# Classification report on the held-out labeled split.
print(classification_report(y_lbl_test, clf.predict(X_lbl_test_d2v)))

#### Prediction on unlabeled data (WIP - not used for the final prediction due to poor results)

First we use our classifier to predict issues on the unlabeled data : 

In [None]:
# Preprocess the unlabeled data with the same pipeline as the labeled data.
X_unlbl = preprocessing_pipeline(unlabeled_data)

In [None]:
# Infer a Doc2Vec vector for every unlabeled row; `model` is expected to be
# the trained Doc2Vec model here.
X_unlbl_d2v = d2v(model, X_unlbl['text'], infer=True)

In [None]:
# Probability of the positive class for every preprocessed unlabeled row.
positive_proba = clf.predict_proba(X_unlbl_d2v)[:, 1]
issue_predictions = pd.DataFrame(positive_proba, index=X_unlbl_d2v.index)

unlabeled_data['xgb_issue_prob'] = issue_predictions
# Rows without a probability are the ones the pipeline filtered out: drop them.
unlabeled_data = unlabeled_data.dropna(subset=['xgb_issue_prob'])
# Binary output: 1 when the probability is strictly above 0.4.
unlabeled_data['xgb_issue'] = (unlabeled_data['xgb_issue_prob'] > 0.4).astype(int)

In [None]:
# Distribution of the thresholded predictions.
unlabeled_data['xgb_issue'].value_counts()

Then we perform sentiment analysis in order to detect false positives :

In [None]:
from textblob import TextBlob

In [None]:
def sentiment(text):
    """Return the TextBlob polarity of ``text``, or ``None`` on failure.

    This is a best-effort helper: any ordinary exception raised while
    analysing the text yields ``None`` instead of propagating.
    """
    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long-running map.
        return None

In [None]:
# Sentiment polarity of every remaining unlabeled text (with a progress bar).
unlabeled_data['sentiment'] = unlabeled_data["text"].progress_map(sentiment)

We cross both results: a text is kept as an issue only when the classifier flags it and its sentiment is not positive:

In [None]:
def combine_preds(row, prob_col_name="xgb_issue", sentiment_col_name="sentiment"):
    """Return 1 when the row is flagged as an issue and its sentiment is <= 0.

    ``row`` is anything indexable by column name. The sentiment value is only
    read when the flag column equals 1.
    """
    if row[prob_col_name] != 1:
        return 0
    return int(row[sentiment_col_name] <= 0)

# Pseudo-label of every unlabeled row: classifier flag combined with sentiment.
unlabeled_data['issue'] = unlabeled_data.apply(combine_preds, axis=1)

In [None]:
# Distribution of the combined pseudo-labels.
unlabeled_data['issue'].value_counts()

#### Retrain

In [None]:
# Second classifier configuration for the retraining step (max_depth=8 and no
# colsample_bytree, unlike `clf`).
clf_all = XGBClassifier(max_depth=8, 
                        n_estimators=750, 
                        min_child_weight=3, 
                        scale_pos_weight=2,
                        learning_rate=0.05, 
                        max_delta_step=0.5,
                        gamma=0.01,
                        n_jobs=-1)

In [None]:
# Stack the labeled training vectors and the unlabeled vectors, and likewise
# the true labels followed by the pseudo-labels.
X_all = pd.concat([X_lbl_train_d2v, X_unlbl_d2v])
y_all = pd.concat([y_lbl_train, unlabeled_data['issue']])

In [None]:
clf.fit(X_all, y_all)

In [None]:
print(classification_report(y_lbl_test, clf.predict(X_lbl_test_d2v)))

### Hackathon : multi-~~label~~ class problem

We decided to reduce the multi-label classification problem to a multi-class one:

In [None]:
# Classes kept for the multi-class problem; their order defines the class
# codes produced by encode_class().
selected_classes = ['screen', 'software_bugs', 'locking_system', 'system', 'apps_update', 'battery_life_charging', 'customerservice']

We create a new `issue` indicator column on the class subset and merge all the class dummy variables into a single categorical target:

In [None]:
# `issue` is 1 when at least one of the selected classes is set for the row,
# 0 otherwise.
X_lbl_train['issue'] = X_lbl_train.loc[:, selected_classes].apply(lambda x: int(x.any()), axis=1)
X_lbl_test['issue'] = X_lbl_test.loc[:, selected_classes].apply(lambda x: int(x.any()), axis=1)

In [None]:
def encode_class(row, classes=None):
    """Encode a row of class dummy variables as a single class code.

    Parameters
    ----------
    row : pandas.Series
        Holds an ``issue`` flag plus one 0/1 entry per class name.
    classes : sequence of str, optional
        Class names in code order. Defaults to the notebook-level
        ``selected_classes``.

    Returns
    -------
    int
        ``0`` when ``issue`` is not 1 or when no class is flagged; otherwise
        the 1-based position in ``classes`` of the first flagged class.
    """
    classes = list(selected_classes if classes is None else classes)

    if row['issue'] != 1:
        return 0

    # Look at the class columns only, so that `issue` itself (or any other
    # column in the row) can never win the argmax.
    flags = row[classes]
    if not flags.any():
        return 0

    # idxmax() returns the label of the first maximum.
    return classes.index(flags.idxmax()) + 1

In [None]:
# One class code per row: 0 for "no issue", otherwise 1 + the position of the
# class in `selected_classes`.
y_lbl_train = X_lbl_train.loc[:, selected_classes + ['issue']].apply(encode_class, axis=1)
y_lbl_test = X_lbl_test.loc[:, selected_classes + ['issue']].apply(encode_class, axis=1)

We train the classifier on this categorical vector :

In [None]:
# Multi-class classifier: the 'multi:softprob' objective is requested so that
# predict_proba() can be thresholded per class later on.
# NOTE(review): `scale_pos_weight` is carried over from the binary setup —
# confirm it has any effect with a multi-class objective.
clf = XGBClassifier(max_depth=8, 
                    n_estimators=750, 
                    objective='multi:softprob',
                    min_child_weight=3, 
                    scale_pos_weight=2,
                    learning_rate=0.05, 
                    max_delta_step=0.5,
                    gamma=0.01,
                    n_jobs=-1)

In [None]:
# Fit on the Doc2Vec training vectors with the class codes as target.
clf.fit(X_lbl_train_d2v, y_lbl_train)

In [None]:
# Classification report of the multi-class model on the held-out split.
print(classification_report(y_lbl_test, clf.predict(X_lbl_test_d2v)))

### Prediction on test data

In [None]:
# Load the test set from CSV (UTF-8 encoded).
test_data = pd.read_csv('../data/test_data.csv', encoding='utf8')

In [None]:
# Preprocess the test set and infer a Doc2Vec vector for every remaining row;
# `model` is expected to be the trained Doc2Vec model here.
X_test = preprocessing_pipeline(test_data)
X_test_d2v = d2v(model, X_test['text'], infer=True)

In [None]:
# Class probabilities of every preprocessed test row. Column 0 is the
# "no issue" code, so it is dropped and the remaining columns are named after
# the selected classes.
test_proba = clf.predict_proba(X_test_d2v)
predictions = pd.DataFrame(test_proba, index=X_test.index).drop(columns=[0])
predictions.columns = selected_classes

# We set a low threshold in order to maximize the recall
predictions = (predictions > 0.2).astype(int)

In [None]:
# Put the texts next to their predicted classes. Rows that the pipeline
# filtered out have no prediction and get 0 for every class.
X_y_test = pd.concat([test_data['text'], predictions], axis=1)
X_y_test = X_y_test.fillna(0)
# `issue` is 1 when at least one class is predicted for the row.
X_y_test['issue'] = X_y_test.iloc[:, 1:].apply(lambda x: int(x.any()), axis=1)

In [None]:
# Preview of the results: rows with at least one flagged column.
# NOTE(review): the slice starts at column 2, so the first class column is
# skipped while the trailing `issue` column is included; the unlabeled-data
# section uses `1:-1` instead — confirm which one is intended.
mask = X_y_test.iloc[:, 2:].apply(lambda x: x.any(), axis=1)
X_y_test[mask]

In [None]:
# Save to disk: every test row, then only the rows selected by `mask`.
X_y_test.to_csv('../data/test_final.csv')
X_y_test[mask].to_csv('../data/test_issues_final.csv')

### Prediction on the unlabeled data

In [None]:
# Class probabilities of every preprocessed unlabeled row. Column 0 is the
# "no issue" code, so it is dropped and the remaining columns are named after
# the selected classes.
unlbl_proba = clf.predict_proba(X_unlbl_d2v)
predictions = pd.DataFrame(unlbl_proba, index=X_unlbl.index).drop(columns=[0])
predictions.columns = selected_classes

# We set a low threshold in order to maximize the recall
predictions = (predictions > 0.2).astype(int)

In [None]:
# Put the texts next to their predicted classes; rows without a prediction
# get 0 for every class.
X_y_unlbl = pd.concat([unlabeled_data['text'], predictions], axis=1)
X_y_unlbl = X_y_unlbl.fillna(0)
# `issue` is 1 when at least one class is predicted for the row.
X_y_unlbl['issue'] = X_y_unlbl.iloc[:, 1:].apply(lambda x: int(x.any()), axis=1)

In [None]:
# Per-class value counts, restricted to the rows flagged as an issue
# (the text column and the trailing `issue` column are left out).
counts = X_y_unlbl.loc[X_y_unlbl['issue'] == 1, :].iloc[:, 1:-1].apply(pd.value_counts)

In [None]:
# Print the No/Yes counts of every class.
# `items()` replaces `iteritems()`, which was removed in pandas 2.0.
for col, values in counts.items():
    print('Feature category: {}'.format(col))
    for i, v in values.items():
        print("{} - {}".format("No" if i == 0 else "Yes", v))

In [None]:
# Rows with at least one predicted class (the text and `issue` columns are
# excluded from the check).
mask = X_y_unlbl.iloc[:, 1:-1].apply(lambda x: x.any(), axis=1)
X_y_unlbl[mask]

In [None]:
# Save to disk: every unlabeled row, then only the rows selected by `mask`.
X_y_unlbl.to_csv('../data/propagation_unlbl_final.csv')
X_y_unlbl[mask].to_csv('../data/unlbl_issues_final.csv')