In [107]:
import pandas as pd
import requests
import urllib.parse
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

### Get the tweets
filters used:

SELECT text FROM rpmdb.rpm_tweets where 

in_retweet_to_status_id is NULL and 

in_reply_to_status_id is NULL and 

quoted_status_id is NULL and 

source like "twitter web client" and 

text not like "%http%" and 

lang = "en";

In [12]:
fileName = "./tweet.dump"
original_tweets = [ line.strip() for line in open(fileName).readlines() ]

In [18]:
len(original_tweets), original_tweets[:10]

(21677,
 ["am i finally using my laptop for late night anime and not editing or whatever nonsense :')",
  'i love antarc',
  "Ohh it's March 1",
  'Silent prayers all the time for her to be healthy joyous so please get well soon',
  '18 but heh no difference.',
  'hi tlist what do you do or say after you swipe right',
  'LRT custom fe cipher playmats uuuuhgghghh soooooo cute',
  'cara too talented to just be a model ;;;;;;;;',
  'Sigh what is this feeling',
  'god the ost for the lunarites is so pretty but fuck those bitches'])

### Clean the tweets using andromeda API

In [39]:
level = 1
classes = [10, 71] #alphanumeric and hashtags
min_word_len = 5 #inclusive

In [30]:
def preprocess_text(str):
   r = requests.get("http://172.29.33.45:8000/textpreprocessor/{}?text={}".format(level, urllib.parse.quote_plus(str)))
   return r.json()

In [58]:
cleaned_tweets = []
for tweet in original_tweets:
    clean_json = preprocess_text(tweet)
    _str = ""
    for token in clean_json['token_list']:
       if token["core"]["class"] in [10]:
            _str += token["core"]["token"] + " "
    _str = _str.strip()
    if len(_str.split(" ")) >= min_word_len :
        cleaned_tweets.append(_str)

In [60]:
len(cleaned_tweets)

18811

### Preprocessing the text: 

In [81]:
nltk_stopwords = list(stopwords.words('english'))
def lemmatize_token_list(lemmatizer, token_list):
    pos_tag_list = pos_tag(token_list)
    for idx, (token, tag) in enumerate(pos_tag_list):
        tag_simple = tag[0].lower() # Converts, e.g., "VBD" to "c"
        if tag_simple in ['n', 'v', 'j']:
            word_type = tag_simple.replace('j', 'a') 
        else:
            word_type = 'n'
        lemmatized_token = lemmatizer.lemmatize(token, pos=word_type)
        token_list[idx] = lemmatized_token
    return token_list
def preprocess_text(s, tokenizer=None, remove_stopwords=True, remove_punctuation=True, 
                    stemmer=None, lemmatizer=None, lowercase=True, return_type='str'):
    # Throw an error if both stemmer and lemmatizer are not None
    if stemmer is not None and lemmatizer is not None:
         raise ValueError("Stemmer and Lemmatizer cannot both be not None!")
    
    # Tokenization either with default tokenizer or user-specified tokenizer
    if tokenizer is None:
        token_list = word_tokenize(s)
    else:
        token_list = tokenizer.tokenize(s)

    # Stem or lemmatize if needed
    if lemmatizer is not None:
        token_list = lemmatize_token_list(lemmatizer, token_list)
    elif stemmer is not None:
        token_list = stem_token_list(stemmer, token_list)
    
    # Convert all tokens to lowercase if need
    if lowercase:
        token_list = [ token.lower() for token in token_list ]
    
    # Remove all stopwords if needed
    if remove_stopwords:
        token_list = [ token for token in token_list if not token in nltk_stopwords ]
        
    # Remove all punctuation marks if needed (note: also converts, e.g, "Mr." to "Mr")
    if remove_punctuation:
        token_list = [ ''.join(c for c in s if c not in string.punctuation) for s in token_list ]
        token_list = [ token for token in token_list if len(token) > 0 ] # Remove "empty" tokens
    
    if return_type == 'list':
        return token_list
    elif return_type == 'set':
        return set(token_list)
    else:
        return ' '.join(token_list)

In [82]:
processed_tweets = []
for tweet in cleaned_tweets:
    #processed_documents[idx] = preprocess_text(doc)
    #processed_documents[idx] = preprocess_text(doc, stemmer=porter_stemmer)
    processed_tweets.append(preprocess_text(tweet, lemmatizer=WordNetLemmatizer()))

In [90]:
reduced_processed_tweets = [tweet for tweet in processed_tweets if len(tweet.split(" ")) > min_word_len]

In [91]:
len(reduced_processed_tweets)

12754

In [93]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf_model = tfidf_vectorizer.fit_transform(reduced_processed_tweets)

### K-means clustering

In [100]:
num_clusters = int(len(reduced_processed_tweets)/10)

In [101]:
km_model = KMeans(n_clusters=num_clusters)
km_model.fit(tfidf_model)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=1275, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [109]:
np.unique(km_model.labels_).shape

(1275,)

In [112]:
clusters = {}


for idx, label in enumerate(km_model.labels_):
    if label in clusters:
        clusters[label].append(reduced_processed_tweets[idx])
    else:
        clusters[label] = [reduced_processed_tweets[idx]]

In [113]:
clusters

{0: ['60 album srsly fuck break lmao'],
 1: ['take shower read genius thing spotify take god damn im glad put song b mix whim'],
 2: ['feel like part soul love since beginning everything',
  'feel like part soul love since beginning everything'],
 3: ['seriously s way much happen one episode grancrest senki',
  'kind romance siluca theo happen grancrest senki episode hint kinda blue tbh',
  'oh man episode grancrest senki damn son',
  'grancrest senki go batshit crazy episode lol',
  'ah man nt know late episode grancrest senki recap episode'],
 4: ['s reason leave first place nt change since month ago'],
 5: ['really low tolerance someone evade personal time think s ca nt r'],
 6: ['try multitask watch drama draw narimiya appear completely stop drawing lol'],
 7: ['dont want bird feel caged hence decide set free even though fledgling hope reunite w family'],
 8: ['april august gon na good month'],
 9: ['pato oward stop track lead wow'],
 10: ['new profile pic tho look like true god'],