## Datacamp CapGemini
#### Group 7

Aims: text preprocessing and topic modeling (TF-IDF + NMF, LDA, graph of words), followed by sentiment analysis

In [None]:
import matplotlib
import numpy as np
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### Import data

In [None]:
# Load the scraped reviews; the Python parsing engine is requested explicitly
# (pandas documents it as slower but more feature-complete than the default C engine)
raw_data = pd.read_csv('../data/data_scraping_V2.csv', engine='python')

In [None]:
# Drop ill-formatted rows, i.e. rows whose text is missing
raw_data = raw_data[raw_data["text"].notnull()]
# Drop rows whose text is empty once surrounding whitespace is stripped
mask = raw_data["text"].map(lambda x: not x.strip())
raw_data = raw_data.loc[~mask]

In [None]:
raw_data.source.value_counts()

In [None]:
# Keep every source except youtube and twitter (work on a copy so raw_data stays untouched)
excluded_sources = ["youtube", "twitter"]
data = raw_data[~raw_data["source"].isin(excluded_sources)].copy()

### Processing

#### First steps

In [None]:
# Remove punctuation: the first group of characters becomes a space, the second group is deleted
replaced_by_space = ",\"_;"
deleted = "'’.()/-?!|:><&[]*=@%^"
matrix = str.maketrans(replaced_by_space, " " * len(replaced_by_space), deleted)
data["text"] = data["text"].map(lambda text: text.translate(matrix))

In [None]:
# Drop every non-ASCII character (code point >= 128)
data["text"] = data["text"].map(lambda text: text.encode("ascii", errors="ignore").decode("ascii"))

In [None]:
# Regex-based processing
import regex

# Strip hashtags ('#' followed by letters, digits or dashes)
hashtag = regex.compile('#[a-zA-Z0-9-]*')
data["text"] = data["text"].map(lambda x: hashtag.sub('', x))
# Drop rows made only of digits and spaces
numbers = regex.compile('^[0-9 ]+$')
mask = data["text"].map(lambda x: numbers.match(x) is None)
data = data[mask]

In [None]:
# Drop the rows left blank by the previous cleaning steps
mask = data["text"].map(lambda x: not x.strip())
data = data.loc[~mask]

In [None]:
# Detect language
import langdetect
def detect_lang(x):
    """Return the language code langdetect assigns to `x`, or None when detection raises.

    Detection is best-effort: any exception raised by `langdetect.detect`
    is swallowed and reported as None so a single bad row cannot abort the
    whole pass over the dataset.
    """
    try:
        lang = langdetect.detect(x)
    except Exception:
        lang = None
    return lang

# langdetect is non-deterministic by default (short or ambiguous texts can get a
# different language on each run); fixing the seed keeps the English filter reproducible.
langdetect.DetectorFactory.seed = 0
data["lang"] = data["text"].progress_map(detect_lang)

In [None]:
# Keep English reviews only; rows for which language detection failed (null lang) are dropped as well
data = data[data["lang"].notnull() & (data["lang"] == "en")]

#### Tokenize

In [None]:
from nltk.tokenize import TweetTokenizer
# Imported under an alias so that the `stopwords` set defined below does not
# overwrite the NLTK corpus reader it is built from.
from nltk.corpus import stopwords as nltk_stopwords

# Lower-cases the text, removes @handles and shortens runs of repeated characters
tweet = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
# Set of English stop words (set membership is O(1) in the filtering step)
stopwords = set(nltk_stopwords.words('english'))

# Tokenize: each review becomes a list of tokens
data["text"] = data["text"].transform(tweet.tokenize)

In [None]:
# Filter the stop words out of every token list
data["text"] = data["text"].map(lambda tokens: [tok for tok in tokens if tok not in stopwords])

In [None]:
# Drop tokens made only of three or more digits; shorter numbers (e.g. "6", "7", "8", "10") are kept
numbers = regex.compile('^[0-9]{3,}$')
data["text"] = data["text"].map(lambda tokens: [tok for tok in tokens if numbers.match(tok) is None])

In [None]:
# Keep only the reviews with more than 5 tokens
data = data[data["text"].map(len) > 5]

In [None]:
# Checkpoint
data.groupby("source").head(10)

### Stemming

##### First attempt

In [None]:
from nltk.stem.snowball import SnowballStemmer

In [None]:
stemmer = SnowballStemmer('english')

def stemming(tokens):
    """Stem each token with the module-level Snowball `stemmer`; 'iphone' is left as is."""
    excluded = {'iphone'}
    stemmed = []
    for token in tokens:
        stemmed.append(token if token in excluded else stemmer.stem(token))
    return stemmed

In [None]:
# Preview only (the result is not stored): the token lists live in the "text" column
data["text"].transform(stemming)

### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
def lemming(tokens):
    """Lemmatize each token as a verb (pos="v") with the module-level WordNet `lemmatizer`."""
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token, pos="v"))
    return lemmas

In [None]:
data["text"] = data["text"].transform(lemming)

In [None]:
data.groupby("source").head(10)

### Token cleaning

In [None]:
data["text"].sample(50).map(lambda x: print(" ".join(x)))

In [None]:
# Custom tokens
def custom_lemming(tokens):
    """Normalise iPhone model mentions in a list of tokens.

    Rules, applied token by token:
    - "x" and "10" both become "10";
    - a model number ("6", "7", "8", "10") gets "iphone" inserted in front of
      it unless the previous token already is "iphone" (this includes a model
      number that is the very first token);
    - "+" is replaced by "plus";
    - every other token is copied unchanged.

    Arguments:
    ----------
    tokens -- list of str
        tokens of one review

    Returns:
    --------
    processed -- list of str
        new list of tokens; `tokens` itself is not modified
    """
    # Token -> normalised model number
    models = {"x": "10", "10": "10", "6": "6", "7": "7", "8": "8"}
    processed = []

    for i, token in enumerate(tokens):
        # iPhones
        model = models.get(token)
        if model is not None:
            # `i == 0` is tested first so tokens[i - 1] never wraps around to the last token
            if i == 0 or tokens[i - 1] != "iphone":
                processed.append("iphone")
            processed.append(model)
        elif token == "+":
            # Replace the sign instead of emitting both "plus" and "+"
            processed.append("plus")
        else:
            processed.append(token)

        # Samsung (no rule implemented)
    return processed

In [None]:
data["text"] = data["text"].transform(custom_lemming)

In [None]:
# Detect frequent bigrams and merge them into single tokens
from gensim.models.phrases import Phrases, Phraser

token_lists = data["text"].tolist()
phrases = Phrases(token_lists)
bigram = Phraser(phrases)

data["bigrams"] = list(bigram[token_lists])

In [None]:
data["bigrams"].sample(50)

### TF IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=70, norm='l2', min_df=2, max_df=0.8, ngram_range=(1, 3))

In [None]:
# The vectorizer expects raw strings, so each token list is joined back with spaces
documents = data["bigrams"].map(' '.join).tolist()
tfidf_matrix = tfidf.fit_transform(documents)

In [None]:
print("Features : {}".format(", ".join(tfidf.get_feature_names())))

In [None]:
pd.DataFrame(tfidf_matrix.todense(), columns=tfidf.get_feature_names()).replace(0, '')

### NMF

In [None]:
from sklearn.decomposition import NMF

In [None]:
nmf = NMF(n_components=10, alpha=.1, l1_ratio=.5).fit(tfidf_matrix)

In [None]:
# tfidf ~ T . W (non-negative factorisation of the TF-IDF matrix)
# T (returned by fit_transform) maps documents into new dimensions (in the case of NMF, we can interpret these as topics)
# W (nmf.components_) holds, for each of these dimensions, one weight per TF-IDF feature (word / n-gram)
# NOTE: fit_transform fits the model again, although the previous cell already called fit()
T = nmf.fit_transform(tfidf_matrix)
W = nmf.components_

In [None]:
pd.DataFrame(W)

In [None]:
# Number of words printed per topic
top = 10
features = tfidf.get_feature_names()

for topic_idx, dimension in enumerate(W):
    print("Topic #{}".format(topic_idx + 1))
    # argsort is ascending: reverse it and keep exactly `top` indexes
    # (the former slice [:-top:-1] returned only top - 1 of them)
    feature_indexes = dimension.argsort()[::-1][:top]
    print("Words : {}".format(", ".join(features[j] for j in feature_indexes)))

### LDA

In [None]:
from gensim import models, corpora

In [None]:
# Create corpora dictionary
tokens_dict = corpora.Dictionary(data["bigrams"].values.tolist())
print(tokens_dict)

In [None]:
# Filter extremes
tokens_dict.filter_extremes(no_below=3, no_above=0.7)
print(tokens_dict)

In [None]:
# Create corpus (bag-of-words per review)
# The dictionary was built from the "bigrams" column, so the corpus must use the same
# tokens; with the plain "text" tokens the merged bigram entries would never be counted.
corpus = [tokens_dict.doc2bow(review) for review in data["bigrams"].values.tolist()]
print(corpus[:3])

In [None]:
# Run the LDA (computation time should be between 5 to 60 seconds)

# Choose the number of topics: to find a "good" value, try several and compare the resulting topics.
# Optionally, pass alpha and eta to influence how topics are distributed across documents
#  and how words are distributed across topics.
#  Per the gensim documentation, alpha may be a scalar or a vector with one entry per topic,
#  and eta a scalar or a vector with one entry per word of the dictionary, for instance:
#  alpha = [0.01] * num_topics
#  eta = [0.01] * len(tokens_dict.keys())

num_topics = 40

# Below without alpha nor eta
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=tokens_dict, passes=4)

# Below with alpha and eta
# %time lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=tokens_dict, passes=4,
#                                   alpha=[0.01] * num_topics, eta=[0.01] * len(tokens_dict.keys()))

In [None]:
lda_model.show_topics(num_topics=num_topics, num_words=8, formatted=True)

# Graph of words

A graph of words links the words that are neighbors in a sentence. <br>For instance, for the sentence "Graph of words use words that are neighbors in sentences", the table below lists the pairs of neighboring words:

| word_1 | word_2 |
|-|-|
| Graph | of |
| of | words |
| words | use |
| use | words |
| words | that |
| that | are |
| are | neighbors |
| neighbors | in |
| in | sentences |

We also apply two refinements:
- keeping only nouns
- using words that are the 2nd neighbors (neighbor of neighbor)

In [None]:
import networkx as nx # to analyse graphs in Python

In [None]:
# We aggregate data from ALL the comments (in our cleaned dataframe)
# And take the words (tokens) that are nouns
# NOTE(review): `df_try` and its `noun_tokens` column are not created anywhere in this notebook;
# presumably they come from a separate noun-extraction step -- confirm before running this cell.
clean_text = df_try.noun_tokens.tolist()

# The functions below will help us build the dataframe of words that are neighbors
def clean_stop_words_in_dataframe(df, stop_words):
    """Return the rows of `df` whose first two columns contain no stop word.

    Rows are excluded by index label: every row sharing its label with a
    row in which either of the first two columns is in `stop_words` is
    removed.
    """
    first, second = df.columns[0], df.columns[1]
    contains_stop_word = df[first].isin(stop_words) | df[second].isin(stop_words)
    flagged_labels = df.index[contains_stop_word]
    return df.loc[~df.index.isin(flagged_labels)]

def word_neighbors(dist):
    """Build the table of word pairs that are `dist` positions apart in each sentence.

    Reads the module-level `clean_text` (list of token lists) and returns a
    DataFrame with columns 'w0' and 'w1', from which every pair containing
    a word of the module-level `STOPWORDS` has been removed.
    """
    # One two-column frame per sentence: each token paired with the token `dist` places later
    pairs = [
        pd.DataFrame([clean_sentence[:-dist], clean_sentence[dist:]]).T
        for clean_sentence in clean_text
    ]
    neighbors = pd.concat(pairs).rename(columns={0:'w0', 1:'w1'}).reset_index(drop=True)
    return clean_stop_words_in_dataframe(neighbors, stop_words=STOPWORDS)

In [None]:
clean_text[0] # nouns of the first comment

In [None]:
# This creates a huge table of all the words that are neighbors and 2nd-order neighbors
# For neighbors we use weight = 2, for 2nd-order neighbors we use weight = 1
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; the rows and
# their index labels are the same.
data_graph_of_words = pd.concat([word_neighbors(1).assign(weight=2), word_neighbors(2).assign(weight=1)])

In [None]:
data_graph_of_words.head()

In [None]:
# Collapse duplicate (w0, w1) pairs into a single row carrying the total weight
data_graph_of_words = data_graph_of_words.groupby(['w0', 'w1'], as_index=False)['weight'].sum()

In [None]:
# nx.__version__ is 2.1
# If you have previous versions, the function might be nx.from_pandas_dataframe()
graph_of_words = nx.from_pandas_edgelist(data_graph_of_words, source='w0', target='w1', edge_attr='weight', 
                                          create_using=nx.Graph())

In [None]:
# We select the words that are neighbors (and 2nd-order neighbors) of the word "problem"
# radius=1 keeps "problem" plus every node one edge away; the edges of graph_of_words
# already cover both 1st- and 2nd-order sentence neighbors
graph_of_words_center = nx.ego_graph(graph_of_words, n='problem', radius=1)
print(graph_of_words_center.size())  # number of edges
print(len(graph_of_words_center))  # number of nodes

In [None]:
# Which words are the most connected to "problem"?
# The degree used here is the weighted degree: the sum of the 'weight' attribute over a node's
# edges inside the ego graph (not only its edge to "problem")
# The one-row frame built from the {word: degree} dict is transposed to get one row per word
pd.DataFrame.from_dict([dict(graph_of_words_center.degree(graph_of_words_center.nodes, weight='weight'))]) \
    .T.rename(columns={0:'degree'}).reset_index().rename(columns={'index':'word'}).sort_values('degree', ascending=False)

In [None]:
# Draw the graph as it is
nx.draw(graph_of_words_center, node_size=20)
# It doesn't give us a lot of information, except that many words connected to "problem" are connected together
# (there's more than one line for each red dot)

In [None]:
# We can use PageRank algorithm to see if some words are more connected to others
pagerank = pd.DataFrame.from_dict([nx.pagerank(G=graph_of_words, alpha=0.99)]).T.rename(columns={0:'pagerank'})

In [None]:
# It confirms what we had with LDA: "phone", "screen", "iphone"... are connected to too many words
pagerank.sort_values('pagerank', ascending=False)

In [None]:
# Let's group words into communities, and see if it makes sense in terms of topics
# The code is taken from the link below
# https://stackoverflow.com/questions/43541376/how-to-draw-communities-with-networkx
def community_layout(g, partition):
    """
    Compute node positions for a graph whose nodes are grouped into communities.

    Each node is placed at the position of its community (communities are
    laid out with scale 3) plus its own position inside that community
    (laid out with scale 1).


    Arguments:
    ----------
    g -- networkx.Graph or networkx.DiGraph instance
        graph to plot

    partition -- dict mapping node -> community
        graph partitions


    Returns:
    --------
    pos -- dict mapping node -> (float x, float y)
        node positions

    """
    community_offsets = _position_communities(g, partition, scale=3.)
    local_positions = _position_nodes(g, partition, scale=1.)

    # final position = community centre + offset inside the community
    return {node: community_offsets[node] + local_positions[node] for node in g.nodes()}

def _position_communities(g, partition, **kwargs):
    """
    Give every node the position of its community.

    A directed graph is built with one node per community and one edge per
    ordered pair of communities linked in `g`, weighted by the number of
    linking edges; it is laid out with nx.spring_layout(**kwargs).
    """
    hypergraph = nx.DiGraph()
    hypergraph.add_nodes_from(set(partition.values()))
    for (ci, cj), edges in _find_between_community_edges(g, partition).items():
        hypergraph.add_edge(ci, cj, weight=len(edges))

    # layout of the communities themselves
    community_positions = nx.spring_layout(hypergraph, **kwargs)

    # each node inherits the position of the community it belongs to
    return {node: community_positions[community] for node, community in partition.items()}

def _find_between_community_edges(g, partition):
    """
    Group the edges of `g` whose two ends lie in different communities.

    Returns a dict mapping (community of first end, community of second end)
    to the list of (node, node) edges, in the order `g.edges()` yields them.
    """
    crossing = dict()

    for ni, nj in g.edges():
        ci, cj = partition[ni], partition[nj]
        if ci == cj:
            # edge inside a single community: not of interest here
            continue
        crossing.setdefault((ci, cj), []).append((ni, nj))

    return crossing

def _position_nodes(g, partition, **kwargs):
    """
    Positions nodes within communities.

    The nodes of each community are laid out on their own, by running
    nx.spring_layout(**kwargs) on the subgraph of `g` they induce.
    """
    # community -> list of its nodes, in the order of `partition`
    members = dict()
    for node, community in partition.items():
        members.setdefault(community, []).append(node)

    pos = dict()
    for nodes in members.values():
        pos.update(nx.spring_layout(g.subgraph(nodes), **kwargs))

    return pos

In [None]:
# to install networkx 2.0 compatible version of python-louvain use:
# pip install -U git+https://github.com/taynaud/python-louvain.git@networkx2
from community import community_louvain

In [None]:
# Words directly linked to "issue", heaviest edges first.
# The full graph is queried because `G` is only assigned in the cells that follow;
# the neighbors and weights of "issue" are the same as in its radius-1 ego graph.
pd.DataFrame(list(graph_of_words['issue'].items())).rename(columns={0:'word', 1:'weight_attr'}) \
    .assign(weight = lambda df: df.weight_attr.map(lambda cell: cell['weight'])) \
    .drop(['weight_attr'], axis=1) \
    .sort_values('weight', ascending=False)

In [None]:
# Communities around the word "problem"
# To save picture, right click on the picture and select "Save image as..."
matplotlib.rcParams['figure.figsize'] = (40, 40)
G = nx.ego_graph(G=graph_of_words, radius=1, n='problem')
partition = community_louvain.best_partition(G)
pos = community_layout(g=G, partition=partition)
# Colours are looked up node by node so they follow the order in which nx.draw
# iterates over G, whatever the key order of the partition dict is
nx.draw(G, pos, node_color=[partition[node] for node in G.nodes()],
        labels={node: node for node in G.nodes()}, font_color='black', font_size=8, font_weight='bold',
        edge_color='lightgray')

In [None]:
# Around the word "issue"
G = nx.ego_graph(G=graph_of_words, radius=1, n='issue')
partition = community_louvain.best_partition(G)
pos = community_layout(g=G, partition=partition)
matplotlib.rcParams['figure.figsize'] = (40, 40)
# Colours are looked up node by node so they follow the order in which nx.draw
# iterates over G, whatever the key order of the partition dict is
nx.draw(G, pos, node_color=[partition[node] for node in G.nodes()],
        labels={node: node for node in G.nodes()}, font_color='black', font_size=8, font_weight='bold',
        edge_color='lightgray')

In [None]:
# Around the word "trouble"
G = nx.ego_graph(G=graph_of_words, radius=1, n='trouble')
partition = community_louvain.best_partition(G)
pos = community_layout(g=G, partition=partition)
matplotlib.rcParams['figure.figsize'] = (40, 40)
# Colours are looked up node by node so they follow the order in which nx.draw
# iterates over G, whatever the key order of the partition dict is
nx.draw(G, pos, node_color=[partition[node] for node in G.nodes()],
        labels={node: node for node in G.nodes()}, font_color='black', font_size=8, font_weight='bold',
        edge_color='lightgray')

In [None]:
# Around the union of the three ego graphs ("issue", "problem" and "trouble")
G = nx.compose_all([nx.ego_graph(G=graph_of_words, radius=1, n=word)
                    for word in ('issue', 'problem', 'trouble')])
partition = community_louvain.best_partition(G)
pos = community_layout(g=G, partition=partition)
matplotlib.rcParams['figure.figsize'] = (40, 40)
# Colours are looked up node by node so they follow the order in which nx.draw
# iterates over G, whatever the key order of the partition dict is
nx.draw(G, pos, node_color=[partition[node] for node in G.nodes()],
        labels={node: node for node in G.nodes()}, font_color='black', font_size=8, font_weight='bold',
        edge_color='lightgray')

In [None]:
# Unfinished: the idea is to save the communities (a.k.a. topics) and the words they contain in a table.
# As written, the one-row frame built from the {word: community} dict is transposed, which gives
# one row per word (the word is the index) and a single column, named 0, with its community id.
# NOTE(review): the original author was unsure this works -- check the output before relying on it.
df = pd.DataFrame([partition]).T

### Sentiment analysis

In [None]:
from textblob import TextBlob

In [None]:
def sentiment(text):
    """Return the TextBlob polarity of `text`, or None when it cannot be computed.

    Scoring is best-effort: any regular exception (e.g. a non-string input)
    yields None. `except Exception` is used rather than a bare `except` so
    that KeyboardInterrupt / SystemExit still stop a long-running pass.
    """
    try:
        return TextBlob(text).sentiment.polarity
    except Exception:
        return None

# Sentiment is computed on the raw review text, because data["text"] now holds token lists.
# Only the rows still present in `data` are scored (index alignment would discard the others anyway).
data["sentiment"] = raw_data.loc[data.index, "text"].map(sentiment)

In [None]:
data.head()