From here on, the code below has been modified by me.

In [1]:
import pickle
from pprint import pprint

In [2]:
# Load the pre-collected twitter user dictionary. The context manager closes the
# file handle as soon as unpickling finishes (the bare open() call left it open).
# NOTE(review): pickle.load can execute arbitrary code; only load a file you trust.
with open("data/pycon_dict.pkl", "rb") as pkl_file:
    pycon_dict = pickle.load(pkl_file)

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Column names shared by the train and test DataFrames built in get_dataframes().
columns = ['message', 'category']

In [5]:
# Two-way lookup between category ids and category names.
# Forward direction first: id -> name, ids assigned by position (0..7).
categories_map = dict(enumerate((
    u'Business & CEOs',
    u'Music',
    u'Entertainment',
    u'Fashion, Travel & Lifestyle',
    u'Sports',
    u'Tech',
    u'Politics',
    u'Science',
)))
# Reverse direction: name -> id, derived from the forward entries (added in
# alphabetical name order). The comprehension is fully built before update() runs,
# so it only sees the eight id -> name pairs.
categories_map.update(
    {name: idx for idx, name in sorted(categories_map.items(), key=lambda pair: pair[1])}
)

In [6]:
def get_dataframes(pycon_dict, seed=None):
    """
    Function to get train and test dataframes (without any preprocessing).

    For every entity, 90% of its tweets (rounded down) are drawn at random
    without replacement and joined with spaces into one training message; the
    remaining tweets, in their original order, form the test message. Each
    entity therefore contributes exactly one row to each dataframe.

    Parameters:
    ----------
    pycon_dict: The twitter user dictionary being used. Indexed as
        pycon_dict[category][entity] -> sequence of objects with a `.text`
        attribute.
    seed: Optional seed for a private RandomState, making the split
        reproducible. With the default (None) NumPy's global random state is
        used, exactly as before.

    Returns:
    -------
    train, test: Train and test dataframes with the columns listed in the
        module-level `columns`; the category column holds
        categories_map[category].
    """
    # np.random (module) and RandomState both expose the same .choice() signature.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Rows are collected in plain lists and turned into DataFrames once at the
    # end: DataFrame.append inside the loop copies the whole frame on every
    # iteration and no longer exists in pandas >= 2.0.
    train_rows = []
    test_rows = []

    for category in pycon_dict:
        for entity in pycon_dict[category]:
            tweets = pycon_dict[category][entity]
            num_texts = len(tweets)  # To get number of tweets
            if num_texts:
                train_indices = rng.choice(num_texts, int(0.9 * num_texts), replace=False)  # Random selection
            else:
                # Older NumPy versions reject choice(0, ...); nothing to draw anyway.
                train_indices = []
            # Set membership keeps the complement computation linear in num_texts.
            in_train = set(int(i) for i in train_indices)
            test_indices = [i for i in range(num_texts) if i not in in_train]  # Rest go into test set

            label = categories_map[category]
            train_rows.append([' '.join(tweets[i].text for i in train_indices), label])
            test_rows.append([' '.join(tweets[i].text for i in test_indices), label])

    train = pd.DataFrame(train_rows, columns=columns)
    test = pd.DataFrame(test_rows, columns=columns)
    return train, test

In [7]:
import re
from nltk.corpus import stopwords  # for using english stopwords
from gensim.models.phrases import Phrases
from gensim.utils import deaccent, decode_htmlentities, lemmatize

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.


In [8]:
# NLTK's English stopword list; passed to lemmatize() inside preprocess_text().
stops = stopwords.words('english')

In [9]:
def preprocess_text(tweet):
    """
    Turn one aggregated user profile into a list of tokens.

    Steps, in order:
    1. Decode html entities. eg. "AT&amp;T" will become "AT&T"
    2. Deaccent, then drop every character that cannot be encoded as ASCII.
    3. Strip links (anything matching http\\S+).
    4. Strip user mentions (@name).
    5. Collapse whitespace and lemmatize with gensim, passing the '(NN)' tag
       pattern, the module-level stopword list and a 3..15 length window.

    Parameters:
    ----------
    tweet : String. If you hold a list of tweets, ' '.join them and pass the
        result.

    Returns:
    -------
    tokens : list with one entry per item returned by lemmatize(), each cut at
        its first '/' (presumably stripping a 'lemma/TAG' suffix; confirm
        against the gensim version in use).
    """
    noun_tags = re.compile('(NN)')

    cleaned = deaccent(decode_htmlentities(tweet))
    cleaned = cleaned.encode('ascii', 'ignore')  # To prevent UnicodeDecodeErrors later on
    for pattern in (r'http\S+', r'@\w+'):  # Step 3 (links), then step 4 (mentions)
        cleaned = re.sub(pattern, '', cleaned)

    # split() + join normalizes every run of whitespace to a single space.
    normalized = ' '.join(cleaned.split())
    tagged = lemmatize(normalized, noun_tags, stopwords=stops, min_length=3, max_length=15)
    return [token.split('/')[0] for token in tagged]

In [10]:
# Build one aggregated row per entity. The tweet split is drawn at random, so the
# frames differ between runs unless NumPy's global RNG is seeded beforehand.
train, test = get_dataframes(pycon_dict)

In [11]:
# Run preprocess_text over every aggregated training profile (one token list per row).
train_texts = train['message'].apply(preprocess_text)

In [12]:
# Fit a gensim Phrases model on the tokenized training profiles; the same model is
# reused at prediction time in pred_tweets().
bigram = Phrases(train_texts)  # For collocation detection

In [13]:
# Apply the fitted phrase model to each profile; train_texts becomes a plain list.
# NOTE: the name is overwritten, so re-running this cell alone feeds already
# transformed profiles through the phrase model again.
train_texts = [bigram[profile] for profile in train_texts]

In [14]:
# Training labels, in the same row order as train['message'].
categories = train['category']

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [16]:
# Bag-of-words vectorizer, capped at 5000 features.
count_vectorizer = CountVectorizer(max_features=5000)

In [17]:
# The vectorizer is fed strings, so each token list is re-joined with spaces
# before fitting the vocabulary and producing the count features.
train_count_features = count_vectorizer.fit_transform(' '.join(text) for text in train_texts)

In [18]:
# Classifier for the word-count features (library-default hyperparameters).
clf_count = LogisticRegression()

In [19]:
# Fit on the count features; labels are cast to int first.
clf_count = clf_count.fit(train_count_features, categories.astype(int))

In [20]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [21]:
# gensim Dictionary built from the phrase-merged training profiles; also used as a
# global by ret_lda_features() and pred_tweets().
dictionary = Dictionary(train_texts)

In [22]:
# Bag-of-words corpus: one doc2bow() result per training profile.
corpus = [dictionary.doc2bow(text) for text in train_texts]

In [23]:
from gensim.models import HdpModel

In [24]:
# Fit an HDP topic model on the bag-of-words corpus.
# NOTE(review): no random_state is passed; the fitted topics may differ between runs (confirm).
hdpm = HdpModel(corpus=corpus, id2word=dictionary)

In [25]:
# Topics from the HDP model as unformatted data, 10 words each.
# Presumably num_topics=-1 requests every topic; confirm against the gensim docs.
hdpmtopics = hdpm.show_topics(num_topics=-1, num_words=10, formatted=False)

In [26]:
# Convert the fitted HDP model into LDA-style parameters; alpha and beta are both
# used when the LdaModel is created in the next cell.
alpha, beta = hdpm.hdp_to_lda()
# Topic count = number of topics returned by show_topics() above.
num_topics = len(hdpmtopics)
print(num_topics)

150


In [27]:
%%time
# Build an LDA model with one topic per HDP topic, using the alpha from hdp_to_lda().
lda_hdp = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, alpha=alpha)
# Overwrite the model's expElogbeta in place with the beta from hdp_to_lda().
# NOTE(review): this replaces whatever the training pass over `corpus` stored
# there. Confirm the (timed) training step is actually needed.
lda_hdp.expElogbeta[:] = beta

CPU times: user 29.7 s, sys: 237 ms, total: 29.9 s
Wall time: 29.9 s


In [28]:
def ret_lda_features(ldamodel, texts, num_topics):
    """
    Function to return LDA inference features for texts.

    Each text is converted with the module-level `dictionary` (doc2bow) and
    passed through `ldamodel`; every (topic id, weight) pair in the result is
    written into that text's row, and topics absent from the result stay 0.

    Parameters:
    ----------
    ldamodel: LDA model to infer docs. Indexing it with a bag-of-words must
        yield (topic id, weight) pairs.
    texts: Texts to be inferred (iterable of token lists)
    num_topics: Number of topics. Will determine columns in dataframe

    Returns:
    -------
    lda_features: LDA features dataframe, one row per text and one column per
        topic id (0 .. num_topics - 1). Empty input gives a frame with zero
        rows but still num_topics columns.
    """
    # Infer first so the matrix can be allocated once with the right row count.
    # This also lets `texts` be any iterable, not just a sized sequence.
    inferences = [ldamodel[dictionary.doc2bow(message)] for message in texts]

    # Fill a single pre-allocated matrix instead of growing a DataFrame with
    # DataFrame.append, which copies on every call and is gone in pandas >= 2.0.
    weights = np.zeros(shape=(len(inferences), num_topics))
    for row, inference in enumerate(inferences):
        for tid, val in inference:
            weights[row, tid] = val

    lda_features = pd.DataFrame(weights)
    return lda_features

In [29]:
# Topic-weight features: one row per training profile, one column per topic.
train_lda_features = ret_lda_features(lda_hdp, train_texts, num_topics)

In [30]:
# Sanity check: expect (number of training profiles, num_topics).
train_lda_features.shape

  def _ipython_display_formatter_default(self):
  def _formatters_default(self):
  def _deferred_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _singleton_printers_default(self):
  def _type_printers_default(self):
  def _deferred_printers_default(self):


(152, 150)

In [31]:
# Peek at the first rows of the topic-weight matrix.
train_lda_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.998766,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.998535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.116588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Second classifier, for the LDA topic-weight features (library-default hyperparameters).
clf_lda = LogisticRegression()

In [33]:
# Fit on the topic-weight features; labels are cast to int first.
clf_lda = clf_lda.fit(train_lda_features, categories.astype(int))

In [34]:
def pred_tweets(tweets, vectorizer=None, clf=None, g2v_or_lda=None):
    """
    Function to classify a user's aggregated tweets. Vectorizer need not be
    provided if g2v_or_lda is being used. g2v_or_lda can be:
        'g2v' for classification with glove word averaging
        'lda' for classification with LDA inference
        None for classification with standard preprocessor and vectorizer

    Parameters:
    ----------
    tweets: String holding the user's tweets joined together
    vectorizer: Vectorizer (pre-trained) to be used; required only when
        g2v_or_lda is None
    clf: Classifier (pre-trained) to be used
    g2v_or_lda: 'g2v', 'lda' or None depending on which "mode" you want to use

    Returns:
    -------
    category: Whatever clf.predict returns for the single feature row.

    Raises:
    ------
    Exception: if clf is missing, or if neither a vectorizer nor a mode is given.
    ValueError: if g2v_or_lda is not one of the supported values.
    """
    if clf is None:
        raise Exception('Classifier has to be provided')
    if vectorizer is None and g2v_or_lda is None:
        raise Exception('Vectorizer should be provided if glove'
                        ' or LDA classification is not being used')

    if g2v_or_lda is None:
        profile = bigram[preprocess_text(tweets)]
        features = vectorizer.transform([' '.join(profile)])
    elif g2v_or_lda == 'g2v':
        # NOTE(review): g2v_tokenize_tweet, word_averaging_list and wv are not
        # defined in the visible part of this notebook; they must exist in the
        # kernel for this mode to work.
        profile = g2v_tokenize_tweet(tweets)
        features = word_averaging_list(wv, profile)
    elif g2v_or_lda == 'lda':
        profile = bigram[preprocess_text(tweets)]
        topic_weights = np.zeros(shape=(1, num_topics))
        for tid, val in lda_hdp[dictionary.doc2bow(profile)]:
            topic_weights[:, tid] = val
        # Wrapped in a DataFrame because the LDA classifier in this notebook is
        # fitted on a DataFrame of the same layout.
        features = pd.DataFrame(topic_weights)
    else:
        raise ValueError("g2v_or_lda must be 'g2v', 'lda' or None, got %r" % (g2v_or_lda,))

    # Predict on the features of whichever mode ran. The previous code always
    # passed `lda_features`, which only existed in 'lda' mode, so the other two
    # modes failed with a NameError.
    category = clf.predict(features)
    return category

In [35]:
#This is used to import the tweet texts that are already collected
import os
from os import listdir
import csv
import pandas

def find_csv_filenames( path_to_dir, suffix=".csv" ):
    """
    List the entries of a directory whose names end with a given suffix.

    Parameters:
    ----------
    path_to_dir: Directory to list.
    suffix: Required ending of the entry name (case-sensitive), '.csv' by default.

    Returns:
    -------
    List of bare entry names (no directory part), in the order listdir yields them.
    """
    matches = []
    for entry in listdir(path_to_dir):
        if entry.endswith(suffix):
            matches.append(entry)
    return matches

slash = '/'  # no longer used below; kept in case a later cell reads it
# NOTE(review): absolute, machine-specific location. Point this at your own
# tweets folder (ideally via a config cell) before running.
# NOTE: the name shadows the builtin dir(); it is kept unchanged because later
# cells may still refer to it.
dir = '/Users/orientchen/Research/TwitterHarvester/Tweets/'
filenames = find_csv_filenames(dir)

# Column names applied to every tweet file; the file's own header row is skipped.
headers = ["id", "created_at", "text"]
df = pd.DataFrame()
for name in filenames:
    # The file name without its extension is the user's handle.
    user_id = os.path.splitext(name)[0]
    print(user_id)
    # Join the directory and the listed file name directly: no doubled slash,
    # and no rebuilding of the name from its parts.
    tweet_file = os.path.join(dir, name)
    df = pd.read_csv(tweet_file, encoding='utf-8', skiprows=1, names=headers)
    # Empty text cells are read as NaN (a float), which would make ' '.join
    # raise TypeError; drop them and make sure the rest are strings.
    tweets = ' '.join(df['text'].dropna().astype(str))
    # 'lda' mode does not use a vectorizer, so none is passed.
    category = pred_tweets(tweets, clf=clf_lda, g2v_or_lda='lda')[0]
    print(category)


01FirstSecond
3
145lewis
0
1RosieThomas
0
1stpattieg
5
2050group
5
20roses
0
313arch
0
350singapore
5
3arabawy
6
3F30
0
3LHD
5
4LoveClothing
1
50NerdsofGrey
0
89linz
3
__K_Z_M__
5
__kuronyan
0
_arthurchin
3
_bingze_
0
_bmturner_
0
_Buddha_Quotes
5
_EileenWalker_
5
_Felmerrr_
1
_jacobward_
5
_jessicanicol
0
_jill_jones
0
_mdrifdi_
0
_mhdrfq_
1
_myplasticheart
1
_TheWhitechapel
1
_xteebingx
4
A57portal
2
a_ndreaaaaa
0
A_WritersStudio
5
A_ZAPATA_A
2
AAFSingapore
0
Aarhus_CM
0
aaron_aw
1
aaronramsey
4
ABBgroupnews
5
AbdullahAlamr13
5
AbhiJadhavNow
5
abraham_stella
5
AbrahamHanover
5
AccioThranduil
0
acm_sg
3
Adam_Pearson
0
AdamCinemre
0
adavies47
0
Adele
1
adelinestwn
5
adesignfilmfest
1
adisyk
0
AditiAmalean
0
AdrianGdMag
1
aesionity
1
AfricasaCountry
6
agent_grace
5
AGi_architects
5
agrun
5
ah_kua_show
5
ahartl_1
5
ahec_anz
1
ahec_europe
1
ahmed_oo
5
ahpau_
1
AIGAdesign
5
AikBengChia
1
aimementoring
5
AJ_AR_BDM
5
AJAMMornings
5
AJCosmokids
0
ajjuliani
5
ajplus
6
ajplusarabi
0
ajplusespan