In [None]:
! pip install --quiet spacy
! python -m spacy download en_core_web_lg

In [1]:
import pandas as pd
# Use the fully-qualified option key: abbreviated keys are resolved by pattern
# matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_colwidth', 80)

In [None]:
# Load the raw CSV export into a DataFrame.
rus = pd.read_csv('trolls/russia_201901_1_tweets_csv_hashed.csv')
# Parse the timestamp column in one vectorised call instead of constructing a
# pd.Timestamp per row through a Python-level lambda.
rus['tweet_time'] = pd.to_datetime(rus['tweet_time'])

In [140]:
from datetime import datetime

def _split(df, year):
    s = datetime(year=2000, month=7, day=1)
    df = df[(df['tweet_time'] > s.replace(year=year)) & 
            (df['tweet_time'] < s.replace(year=year, month=11))]
    return df.sample(750)

def split_and_label(df, splits):
    splits = [_split(df, year).assign(label = label) for year,label in splits]
    return pd.concat(splits)

In [150]:
# One labelled sample per year; the text column becomes the features and the
# year label becomes the target.
df = split_and_label(
    rus,
    [(2016, 'y2016'), (2017, 'y2017'), (2018, 'y2018')],
)
X = df['tweet_text']
y = df['label']

In [167]:
# Persist the sampled tweets and their labels (the index is written too).
pd.DataFrame(
    {'tweet': X, 'label': y}
).to_csv(path_or_buf='tweets.csv')

In [161]:
from re import sub, split
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE, MDS
import seaborn as sns

def plot_embedding(V, y, n_components=50, random_state=None):
    """Reduce a feature matrix to 2-D with t-SNE and scatter-plot it by label.

    Each row of ``V`` becomes one point, coloured by the matching entry of
    ``y``. ``V`` is first compressed with ``TruncatedSVD`` and the result is
    embedded with cosine-metric t-SNE.

    Parameters
    ----------
    V : array-like or sparse matrix
        Feature matrix, one row per sample (e.g. vectorizer output).
    y : pandas.Series
        One label per row of ``V``. Its index is discarded so labels line up
        with rows by position.
    n_components : int, default 50
        Number of SVD components kept before t-SNE.
    random_state : optional
        Passed to both ``TruncatedSVD`` and ``TSNE``; set it to get the same
        layout on every run. ``None`` keeps both unseeded.

    Returns
    -------
    tuple
        ``(ax, d)``: the object returned by ``sns.scatterplot`` and the
        DataFrame holding the two embedding coordinates (columns ``0`` and
        ``1``) plus a ``label`` column.
    """
    reduced = TruncatedSVD(n_components, random_state=random_state).fit_transform(V)
    coords = TSNE(metric='cosine', random_state=random_state).fit_transform(reduced)
    # reset_index: y usually carries the sampled rows' original index, which
    # would misalign against the fresh 0..n-1 index of the coordinate frame.
    d = pd.DataFrame(coords).assign(label=y.reset_index(drop=True))
    ax = sns.scatterplot(x=0, y=1, hue='label', data=d)
    return ax, d


def clean_twitter(s):
    """Normalise one tweet: lowercase it and strip Twitter-specific noise.

    Removes, in order: URLs (``http(s)://...`` and ``www....``), a leading
    retweet header (``rt @user:``), and remaining ``@mentions``. Runs of
    whitespace are then collapsed to a single space.

    Parameters
    ----------
    s : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned, lowercased text (may be empty).
    """
    s = s.strip().lower()

    # URLs carry no lexical signal and would otherwise be split into junk tokens.
    s = sub(r'https?://\S+|www\.\S+', ' ', s)
    # Retweet header, e.g. "rt @someone: ..." (text is already lowercased).
    s = sub(r'^rt\s+@\w+:?', ' ', s)
    # Remaining mentions; the lookbehind leaves e-mail-like "name@host" intact.
    s = sub(r'(?<!\w)@\w+', ' ', s)
    # Collapse the gaps left by the removals above.
    s = sub(r'\s+', ' ', s).strip()

    # BONUS: Try using the library "spacy" to
    # do further processing, such as lemmatizing
    # or replacing Named Entities with constants (i.e. "[NAMED]")
    # or adding the part of speech or dependency code to the word

    return s

In [143]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Baseline: vectorize the raw tweets using nothing but scikit-learn's default
# cleaning and tokenizing, then plot the embedding colored by label.

vectorizer = CountVectorizer()
V = vectorizer.fit_transform(X)
ax, d = plot_embedding(V, y)

In [None]:
# Same pipeline, but every tweet is passed through clean_twitter first, so the
# plot shows what our cleaning has done.
# NOTE: a custom preprocessor replaces CountVectorizer's built-in
# lowercasing / accent-stripping step rather than running in addition to it.

vectorizer = CountVectorizer(preprocessor = clean_twitter)
V = vectorizer.fit_transform(X)
ax, d = plot_embedding(V, y)

In [None]:
# Now try with TF-IDF vectorizer, and add implicit stopwords!
# Can you get things to separate in the space in a better way? 