# In [None]:
from math import*
import re
import logging
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
import seaborn as sns
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# Emit every message from DEBUG upwards, formatted as "LEVEL:message".
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(message)s')

# Load the DataFrame with all the TMT articles from 'articles.csv'.
# More info on this can be found in part 1 of the TMT recommender article series:
# www.themarketingtechnologist.co/building-a-recommendation-engine-for-geek-setting-up-the-prerequisites-13/
df = pd.read_csv('articles.csv', encoding='utf-8')

# Only the title, the article text and the tags are used below; drop the rest.
df = df[['title', 'content_text', 'tags']]
logging.debug("Number of articles: {0}\n".format(len(df)))

def assign_single_tag(x):
    """
    Picks the first tag out of a comma-separated tag string.
    :param x: String with zero or more comma-separated tags
    :return: Lower-cased text before the first comma, or the string "None" when that text is empty
    """
    # Everything before the first comma (the whole string when there is no comma).
    first_tag, _, _ = x.lower().partition(",")
    if first_tag == "":
        return "None"
    return first_tag
# Remove the square brackets around the serialized tag list
df['tags'] = df['tags'].apply(lambda raw: raw.translate({ord("["): None, ord("]"): None}))
# Keep only the first tag of each article in a separate column
df['tags_first'] = df['tags'].apply(assign_single_tag)

def tokenize(text):
    """
    Tokenizes sequences of text and stems the tokens.

    The text is split on whitespace, every token is reduced to its ASCII letters,
    English stopwords are dropped and the remaining tokens are stemmed with the
    English Snowball stemmer. Tokens are de-duplicated before stemming, so every
    distinct cleaned token contributes one stem (two different tokens may still
    yield the same stem).

    Tokens are returned in order of first appearance in the text. The order matters
    because this function is used as the CountVectorizer tokenizer, which builds
    its n-grams from the returned sequence.

    :param text: String to tokenize
    :return: List with stemmed tokens (empty stems are left out)
    """
    raw_tokens = nltk.WhitespaceTokenizer().tokenize(text)

    # Load the stopword list once per call (not once per token) and use a set
    # so the membership test below is O(1).
    english_stopwords = set(stopwords.words('english'))

    # First pass keeps apostrophes so that contractions such as "don't" can still
    # match their stopword entry. dict.fromkeys removes duplicates while keeping
    # first-seen order (a plain set would make the order arbitrary).
    cleaned = dict.fromkeys(re.sub(r"[^a-zA-Z']", "", token) for token in raw_tokens)
    kept = [word for word in cleaned if word not in english_stopwords]

    # Second pass removes the apostrophes that survived the stopword filter.
    letters_only = dict.fromkeys(re.sub(r"[^a-zA-Z]", "", word) for word in kept)

    stemmer = SnowballStemmer("english")
    stems = (stemmer.stem(word) for word in letters_only)
    # Tokens that consisted only of stripped characters end up empty; skip them.
    return [stem for stem in stems if stem != ""]

def get_vectorizer(ngram_range=(1, 3), min_df=2, max_df=1.0):
    """
    Build a binary CountVectorizer (feature presence instead of counts) that uses
    the module's `tokenize` function and scikit-learn's English stop word list.
    :param ngram_range: (min_n, max_n) tuple; n-grams are created for all n within this range
    :param min_df: min document frequency of features
    :param max_df: max document frequency of features
    :return: Unfitted CountVectorizer configured with the given settings
    """
    return CountVectorizer(
        tokenizer=tokenize,
        stop_words='english',
        binary=True,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
    )

def reduce_dimensionality(X, n_features):
    """
    Project X onto n_features components using truncated SVD and log how much
    variance the kept components explain.
    :param X: Feature matrix to reduce (anything TruncatedSVD accepts)
    :param n_features: Number of components to keep
    :return: X transformed to the n_features-dimensional space
    """
    # TruncatedSVD is the active reducer; PCA(n_components=n_features) is the
    # alternative the original author considered.
    svd = TruncatedSVD(n_components=n_features).fit(X)
    # Transform with the fitted model (kept as a separate step from fitting).
    reduced = svd.transform(X)
    logging.debug("Reduced number of features to {0}".format(n_features))
    # NOTE: the logged value is the summed explained-variance ratio, i.e. a
    # fraction between 0 and 1 rather than a percentage.
    logging.debug("Percentage explained: %s\n" % svd.explained_variance_ratio_.sum())
    return reduced

# Build presence features (uni- and bigrams) from the article titles and
# compress them to a small dense representation.
n_features_title = 25
X_title = None
vectorizer = get_vectorizer(min_df=2, ngram_range=(1, 2))
# Vectorize the titles and convert the result to a dense float array in one go
X_title = np.array(vectorizer.fit_transform(df['title']).toarray(), dtype=float)
logging.debug("Number of features in title: {0}".format(len(vectorizer.vocabulary_)))
# Reduce dimensionality of title features
X_title = reduce_dimensionality(X_title, n_features=n_features_title)
print(X_title)