In [None]:
# Widen the notebook container to 80% of the browser window.
# `display` and `HTML` come from the public IPython.display module:
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# 1) Imports

In [None]:
%pylab inline
import time
import pandas as pd
import os
import seaborn as sns
import math

In [None]:
def display_progress_bar(n_tot, n_prog, delay=0.25):
    """
    Draw a one-line text progress bar on stdout, overwriting the previous one.

    Parameters
    ----------
    n_tot : int
        Total number of steps to perform.
    n_prog : int
        Zero-based index of the step that has just been processed.
    delay : float, optional
        Seconds to pause after drawing the bar. Defaults to 0.25, the pause
        this function has always applied; pass 0 to draw without waiting.
    """
    # Local imports: `sys` is not imported explicitly anywhere else in the notebook
    import sys
    from time import sleep

    if n_tot > 0:
        progress = math.ceil((n_prog + 1) * 100 / n_tot)
    else:
        # Nothing to process: show a complete bar instead of dividing by zero
        progress = 100
    # Keep the bar inside its 100-character frame whatever the inputs
    progress = max(0, min(100, progress))

    sys.stdout.write('\r')
    sys.stdout.write("[%-100s] %d%%" % ('=' * progress, progress))
    sys.stdout.flush()
    if delay > 0:
        sleep(delay)

In [None]:
def missing_values(df):
    """
    Summarise the missing values of every column of a dataframe.

    The summary is displayed in the notebook and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of `df` with the columns 'feature',
        'missing values' (count) and 'missing values (%)', sorted by
        decreasing percentage of missing values.
    """
    n_rows = df.shape[0]
    missing_val_df = df.isnull().sum(axis=0).reset_index()
    missing_val_df.columns = ['feature', 'missing values']
    # Direct ratio: simpler and free of the rounding noise of "100 - present %"
    missing_val_df['missing values (%)'] = missing_val_df['missing values'] / n_rows * 100
    missing_val_df = missing_val_df.sort_values('missing values (%)', ascending=False)
    display(missing_val_df)

    return missing_val_df

In [None]:
def replace_tag_synonym(source_tag, synonyms=None):
    """
    Return the tag that `source_tag` is a synonym of, or `source_tag`
    itself when it has no known synonym.

    Parameters
    ----------
    source_tag : hashable
        Tag to look up (missing values such as NaN are returned unchanged).
    synonyms : dict, optional
        Pre-built {source tag: target tag} mapping. When omitted, the mapping
        is built from the notebook-level `tags_synonyms` dataframe
        (SourceTagName -> TargetTagName) on every call, which is slow when
        the function is applied to a whole column: build the dict once and
        pass it in that case.

    Returns
    -------
    The target tag if `source_tag` is a known synonym, else `source_tag`.
    """
    if synonyms is None:
        synonyms = dict(zip(tags_synonyms.SourceTagName.values, tags_synonyms.TargetTagName.values))
    return synonyms.get(source_tag, source_tag)

In [None]:
def join_tags_minus_nans(row):
    """Join the non-missing tags of a row into one '/'-separated string."""
    present_tags = row[row.notna()]
    return "/".join(present_tags)

In [None]:
def count_tags(data):
    """
    Count how often each tag appears across the `tags_features` columns.

    Returns a dataframe with one row per tag, most frequent first, giving
    the tag, its number of occurrences and its share of all occurrences.
    """
    # Flatten every tag column into a single 1-D array of tags
    flat_tags = data.loc[:, tags_features].squeeze().values.ravel()
    occurrences = pd.Series(flat_tags).value_counts()
    shares = 100 * (occurrences / occurrences.sum())
    summary = {'Tag': occurrences.index,
               'Count': occurrences.values,
               'Prcentage (%)': shares.values}
    return pd.DataFrame(summary)

# 2) Gathering data

In [None]:
# Getting current path
path = os.getcwd()
# Raw exports are the 'QueryResults*' files of the 'data' folder.
# os.listdir returns entries in arbitrary order, so sort them to
# assemble the dataset in a reproducible order.
data_files = sorted(file for file in os.listdir(os.path.join(path, 'data')) if file.startswith('QueryResults'))

Loop over the files and build the main dataframe

In [None]:
# Load the consolidated questions dataset, building it from the raw
# 'QueryResults*' exports the first time the notebook is run.
questions_csv = os.path.join(path, "data", "data_questions.csv")
try:
    print("Loading questions full dataset")
    data = pd.read_csv(questions_csv, sep=',')
except FileNotFoundError:
    # No consolidated file yet: read every raw export...
    frames = []
    for file in data_files:
        print("Treating file : %s" % file)
        try:
            frames.append(pd.read_csv(os.path.join(path, "data", file), sep=','))
        except FileNotFoundError:
            # Skip the missing file rather than stacking a stale frame again
            print("Please check if the file %s is in the 'data' folder at the current location" % file)
    if not frames:
        raise FileNotFoundError("No 'QueryResults*' file could be read from the 'data' folder")
    # ...and stack them in a single concat (DataFrame.append was removed in
    # pandas 2.0 and grew the frame quadratically). ignore_index gives the
    # same unique row labels as reloading the saved CSV would.
    data = pd.concat(frames, ignore_index=True)

    # Save data
    print("Saving")
    data.to_csv(questions_csv, index=False)

In [None]:
# List the columns available in the loaded questions dataset
data.columns

In [None]:
# Load the tag synonyms table (SourceTagName -> TargetTagName pairs)
try:
    print("Loading Tags Synonyms dataset")
    tags_synonyms = pd.read_csv(path + "/data/Tags_Synonyms.csv", sep=',')
except FileNotFoundError:
    # Only a missing file gets this hint; any other failure (parsing,
    # permissions, ...) surfaces with its real traceback instead of being
    # reported as a missing file.
    print("Please check if the file 'Tags_Synonyms' is in the 'data' folder at the current location")

Loading main dataframe, pre-computed

In [None]:
# Prefer the pre-computed clean dataset when it exists. When it does not,
# `data` keeps the raw questions loaded by the previous loading cell.
try:
    print("Loading clean questions full dataset")
    data = pd.read_csv(path + "/data/data_questions_clean.csv", sep=',')
    print("Clean questions full dataset loaded")
except FileNotFoundError:
    # Only a missing file is expected here; other errors must not be
    # reported as "file not in the folder".
    print("The 'data_questions_clean.csv' file is not in the 'data' folder")

Having a look at our data

In [None]:
# Preview the first rows of the loaded dataset
data.head()

# 3) Cleaning

Keeping only features of interest

In [None]:
# Restrict the working dataframe to the two text fields and the tags
kept_columns = ['Body', 'Title', 'Tags']
data_raw = data.loc[:, kept_columns]

## 3.1) Duplicates and missing values

In [None]:
# Getting rid of the duplicates
print("initial shape : ", data_raw.shape)
dup = data_raw[data_raw.duplicated()].shape[0]
if dup > 0 :
    print("duplicates found : ", dup)
    data_raw = data_raw.drop_duplicates(keep='first')
    print("Shape without duplicates: ", data_raw.shape)
else :
    print("No duplicate")

In [None]:
# Looking for missing values
mv_df = missing_values(data_raw)

As *'Tags'* is the target of our tag prediction tool, we drop the rows whose Tags value is missing

In [None]:
# Names of the columns that hold the individual tags of a question (one tag per column)
tags_features = ['Tag_1', 'Tag_2', 'Tag_3', 'Tag_4', 'Tag_5']
# Columns of interest for later steps
# NOTE(review): not used in the cells visible here — confirm where it is consumed
interesting_features = ['Body', 'Title', 'clean_body', 'Tags', 'TitleBody', 'clean_title_body']

In [None]:
# Rows without any tag cannot be used
data_raw = data_raw.dropna(subset=['Tags'])
print("New shape : ", data_raw.shape)
# The derived tag columns are only filtered on when they are present in the dataframe
if 'New_Tags_syn' in data_raw.columns:
    data_raw = data_raw.dropna(subset=['New_Tags_syn'])
    print("New shape : ", data_raw.shape)
if all(feature in data_raw.columns for feature in tags_features):
    # Drop a row only when every individual tag column is empty
    data_raw = data_raw.dropna(subset=tags_features, how='all')
    print("New shape : ", data_raw.shape)

We will use Body and Title to train our models, so we drop the rows where either of them is missing

In [None]:
# Keep only the rows where both text fields are present
text_features = ['Body', 'Title']
data_raw = data_raw.dropna(axis=0, how='any', subset=text_features)
print("New shape : ", data_raw.shape)

## 3.2) Feature engineering

Body and Title may both contain interesting clues, so we concatenate them into one new string

In [None]:
# Put a space between title and body so the last word of the title
# does not fuse with the first word of the body
data_raw['TitleBody'] = data_raw.Title + ' ' + data_raw.Body

In [None]:
# Preview the text columns, the tags and the new concatenated column
data_raw.loc[:, ['Body', 'Title', 'Tags', 'TitleBody']].head()

Removing Tags chevrons

In [None]:
# "<tag1><tag2>" -> "tag1/tag2": trim the outer chevrons, remove the
# remaining closing ones and turn each opening one into a separator
data_raw['New_Tags'] = (
    data_raw.Tags
    .str.strip('<')
    .str.strip('>')
    .str.replace('>', '', regex=False)
    .str.replace('<', '/', regex=False)
)

Counting Tags

In [None]:
# A string holding k separators holds k + 1 tags
data_raw['n_Tags'] = data_raw.New_Tags.str.count('/') + 1

Separating tags into individual features

In [None]:
# One list of tags per question, as an array of Python lists
tags_lists = data_raw.New_Tags.str.split('/').values

In [None]:
# Initialise new list of tags
filled_tags_list = []
# Loop over lists of tags
for inner_list in tags_lists:
    # Get list length
    length = len(inner_list)
    # While length not equal to 5 append nans
    while length < 5:
        inner_list.append(np.nan)
        length = len(inner_list)
    # Add extended list to new list
    filled_tags_list.append(inner_list)

Convert lists of tags into dataframe

In [None]:
# One column per tag position
tags_df = pd.DataFrame(filled_tags_list)
# Keep exactly one column per entry of tags_features: surplus columns (e.g. an
# empty trailing label) are dropped and absent ones are created as NaN.
# Dropping label 5 unconditionally raised a KeyError whenever no sixth column existed.
tags_df = tags_df.reindex(columns=range(len(tags_features)))
# Align on the questions so the tags can be concatenated column-wise
tags_df.index = data_raw.index
tags_df.columns = tags_features

In [None]:
# Add separated tags to dataframe
data_raw = pd.concat((data_raw, tags_df), axis=1)

Looking for tags that can be replaced with synonyms

Before removing synonyms :

In [None]:
# Split every question's tag string, flatten the result and keep the distinct tags
temp_list = [joined_tags.split('/') for joined_tags in data_raw.New_Tags.values.tolist()]
tags_list = []
for question_tags in temp_list:
    tags_list.extend(question_tags)
unique_tags = set(tags_list)
print("Total of %i unique Tags" % len(unique_tags))

Look for synonyms

In [None]:
# Synonym source tags that actually occur in our questions
tags_syns_in_tags = [sourcetag for sourcetag in tags_synonyms.SourceTagName
                     if sourcetag in unique_tags]
print("%i Tags are synonyms and can be replaced" % len(tags_syns_in_tags))

Replace tags that can be replaced (time consuming)

In [None]:
# Build the synonym lookup once: rebuilding it for every single tag is what
# made this step so slow
synonyms_dict = dict(zip(tags_synonyms.SourceTagName.values, tags_synonyms.TargetTagName.values))
for tag_feature in tags_features:
    # Tags without a known synonym (missing values included) are kept unchanged
    data_raw.loc[:, tag_feature] = data_raw.loc[:, tag_feature].apply(lambda tag: synonyms_dict.get(tag, tag))

Save modified raw data

In [None]:
# Rebuild the '/'-separated tag string of each question from its individual
# tag columns, skipping the missing ones
data_raw['New_Tags_syn'] = data_raw.loc[:, tags_features].apply(join_tags_minus_nans, axis=1)

After removing synonyms :

In [None]:
temp_list = [x.split('/') for x in data_raw.New_Tags_syn.values.tolist()]
tags_list = [y for x in temp_list for y in x]
# Remove nan while building the list: removing items from a list that is being
# iterated over skips elements, and calling np.isnan on a string raises a
# TypeError, so only floats are tested
unique_tags_syn = [tag for tag in set(tags_list)
                   if not (isinstance(tag, float) and np.isnan(tag))]
print("Total of %i unique Tags" % len(unique_tags_syn))

Saving processed data

In [None]:
# Persist the processed questions without the index column; the path is
# relative to the current working directory
data_raw.to_csv("data/data_questions_clean.csv", index=False)

## 3.3) Exploration

Questions seem to have one or multiple tags, investigating this

Get all tags combinations

In [None]:
# Frequency of every distinct combination of tags, most frequent first
tags_comb_counts = data_raw['New_Tags_syn'].value_counts()
n_combinations = len(tags_comb_counts)
print("Most popular tag combinations :")
display(tags_comb_counts.head())
print("Less popular tag combinations :")
display(tags_comb_counts.tail())
print("Total of %i Tags combinations" % n_combinations)

In [None]:
fig, ax = plt.subplots(figsize=(8, 5))
top_combinations = tags_comb_counts[:10]
# x and y are passed by keyword (positional data arguments are rejected by
# recent seaborn releases) and the target axes is given explicitly
plot = sns.barplot(x=top_combinations.index.values, y=top_combinations.values, ax=ax)
plot.set_xticklabels(plot.get_xticklabels(), rotation=45, fontsize=12)
ax.set_title("10 Most popular Tags combinations", fontsize=12)
ax.set_ylabel("Count", fontsize=12);

How many Tags per question?

In [None]:
fig, ax = plt.subplots(figsize=(8, 5))
# sns.distplot is deprecated; with kde=False it only drew a matplotlib
# histogram (alpha 0.4), which is drawn directly on the axes here
ax.hist(data_raw.n_Tags.dropna(), bins=10, alpha=0.4)
ax.set_xlabel("Number of Tags by question", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
ax.set_title("Distribution of number of Tags by question", fontsize=12);

Questions mostly carry 2 or 3 tags, out of a maximum of 5

### Recovering and counting all individual Tags

Counting Tags in Dataframe in order to find most popular Tags

In [None]:
# Occurrences and share of every individual tag, most frequent first
count_tags_df = count_tags(data_raw)
count_tags_df.head()

In [None]:
# count_tags_df is sorted by decreasing count, so label 0 is the most frequent tag
most_popular_tag = count_tags_df['Tag'][0]

In [None]:
# Bar chart of the ten most frequent individual tags
fig, ax = plt.subplots(figsize=(8, 5))
top_tags = count_tags_df[:10]
plot = sns.barplot(data=top_tags, x='Tag', y='Count')
ax.set_title("10 Most popular Tags", fontsize=12)
plot.set_xticklabels(plot.get_xticklabels(), rotation=45, fontsize=12);