In [9]:
"""
Issues: 
* rand_users() sampling duplicate users (should be solved)
* returning users with fewer than 120 tweets (solved)
* not enough users to make 100 (solved)

To do:
* batch reading of json files (done)
* for users with more than 120 tweets, only consider the first 120
"""
import os
import json
import numpy as np
import random
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict

# Root folder for this experiment's data, resolved under the current user's home directory.
data_folder = os.path.join(os.path.expanduser("~"), "Data", "research")
# JSON file that the experiment loop dumps the collected scores into.
results_output_filename = os.path.join(data_folder, "attribution_results_control_2.json")
# Folder that is listed to find the tweet JSON files.
tweets_folder = os.path.join(data_folder, "control_tweets")

# [file1, file2, ... ]
def read_files_from_dir(datafolder):
    """Return the full paths of every entry in ``datafolder``.

    Args:
        datafolder: Path of the directory to list.

    Returns:
        A list of paths, each prefixed with ``datafolder``, sorted by entry
        name so the order is the same on every run (``os.listdir`` returns
        entries in arbitrary order).
    """
    # Use the argument rather than the module-level ``tweets_folder`` so the
    # function lists whichever directory it is actually given.
    return [os.path.join(datafolder, name) for name in sorted(os.listdir(datafolder))]

# Paths of the tweet files; this list is what read_merge_tweets() is later called with.
files_list = read_files_from_dir(tweets_folder)

def read_merge_tweets(files):
    """Read every tweet file and merge the per-author tweet lists.

    Each file is parsed as JSON and passed through
    ``remove_low_tweet_authors`` before merging, so the minimum-tweet filter
    is applied per file, not to the merged totals.

    Args:
        files: Iterable of paths to JSON files. Each file is assumed to hold
            a mapping of author -> list of tweets (the values are
            concatenated with ``list.extend`` below).

    Returns:
        A ``defaultdict(list)`` mapping each retained author to the
        concatenation of their tweets from all files, in file order.
    """
    merged = defaultdict(list)
    for path in files:
        # JSON text is UTF-8 by specification; do not depend on the locale's
        # default encoding when reading it.
        with open(path, 'r', encoding='utf-8') as inf:
            tweets_dict = json.load(inf)
        # NOTE(review): filtering before the merge drops an author whose
        # tweets are split across files with too few in each one, even if the
        # combined total would qualify -- confirm this is intended.
        for author, author_tweets in remove_low_tweet_authors(tweets_dict).items():
            # Lists are concatenated (not de-duplicated): a tweet that appears
            # in two files is kept twice.
            merged[author].extend(author_tweets)
    return merged
    
def copy_keys(table1, keys):
    """Copy the entries for ``keys`` out of ``table1`` into a new dict.

    Args:
        table1: dict that we copy FROM. It is never modified.
        keys: iterable of keys to copy.

    Returns:
        table2 -- a new dict holding ``table1[key]`` for every requested key
        that is present with a non-empty value. A key that is missing (or
        maps to an empty value) is reported on stdout and skipped; the
        remaining keys are still copied.
    """
    table2 = {}
    for key in keys:
        # .get() avoids a KeyError on a plain dict and avoids silently
        # inserting an empty entry when table1 is a defaultdict.
        value = table1.get(key)
        if value:
            table2[key] = value
        else:
            print("key {!r} does not exist in table 1".format(key))
    return table2

def rand_users(users, sample_size):
    """Draw ``sample_size`` distinct users at random, without replacement.

    Args:
        users: Iterable of users to draw from (a list, a set, or a dict
            ``keys()`` view).
        sample_size: Number of users to draw.

    Returns:
        A list of ``sample_size`` users, each taken from a different position
        of ``users``.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``sample_size`` is
            negative or larger than the number of users.
    """
    # random.sample needs a real sequence: sets and dict views are rejected
    # from Python 3.11 onwards, so materialise the population first.
    population = list(users)
    return random.sample(population, sample_size)
    
def remove_hashtag(tweet):
    """Placeholder: hashtag removal is not implemented yet.

    The argument is ignored and ``None`` is returned.
    """
    return None

def remove_at_symbol(tweet):
    """Placeholder: '@' removal is not implemented yet.

    The argument is ignored and ``None`` is returned.
    """
    return None

def remove_low_tweet_authors(tweets, min_tweets=120):
    """Keep only the authors that have at least ``min_tweets`` tweets.

    120 is an optimal number of tweets for authorship attribution (Layton),
    hence the default.

    Args:
        tweets: Dict mapping author -> collection of tweets (anything that
            supports ``len()``).
        min_tweets: Minimum number of tweets an author needs to be kept.
            Defaults to 120.

    Returns:
        A new dict containing only the qualifying authors. The input dict is
        not modified; the tweet collections themselves are shared, not
        copied, and authors above the threshold keep all of their tweets.
    """
    return {
        author: author_tweets
        for author, author_tweets in tweets.items()
        if len(author_tweets) >= min_tweets
    }
    
# tweets.keys()

In [10]:
# Show which input files are used, then load and merge them into a single
# author -> tweets mapping.
for tweet_file in files_list:
    print(tweet_file)
tweets = read_merge_tweets(files_list)
# Number of authors present in the merged mapping.
print(len(tweets))

/Users/lpan/Data/research/control_tweets/control_08-10-2015.json
/Users/lpan/Data/research/control_tweets/control_2.json
/Users/lpan/Data/research/control_tweets/control_3.json
170


In [11]:
# authors = {}  ## not needed for actual data mining
def join_documents(tweets):
    """Flatten an author -> tweets mapping into parallel document/label lists.

    Authors are visited in sorted order so the numbering does not depend on
    dict iteration order; the i-th author (0-based) gets class label ``i``.

    Returns:
        A ``(documents, classes)`` tuple: every tweet of every author in one
        list, and a same-length list with the class label of each tweet.
    """
    documents, classes = [], []
    for author_num, (_author, author_tweets) in enumerate(sorted(tweets.items())):
        # Append this author's tweets to the corpus ...
        documents += author_tweets
        # ... and label each of them with the author's number.
        classes += [author_num] * len(author_tweets)
    return documents, classes

In [12]:
from sklearn.svm import SVC # support vector machines
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import grid_search

""" Set up the parameters. 'C' refers to amount of smoothing. 
Kernel introduces non-linear elements to make them linearly separable(?) 
"""
# Hyper-parameter grid searched for the SVM. NOTE: 'C' is the SVM
# regularisation parameter (regularisation strength is inversely proportional
# to C), not a smoothing amount; 'kernel' selects a linear or an RBF kernel.
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svr = SVC()
grid = grid_search.GridSearchCV(svr, parameters)

# extract character ngrams
pipeline = Pipeline([('feature_extraction',
                      CountVectorizer(analyzer='char', ngram_range=(3,3))),
                     ('classifier', grid)])

scores = defaultdict(list)
# Must be an iterable of sample sizes: a bare int here makes the loop below
# fail with "TypeError: 'int' object is not iterable".
iter_sample = [10]  # full sweep: [10, 20, 30, 40, 50]
RUNS = 1

for sample_size in iter_sample:
    # Cannot sample more authors than were loaded.
    assert sample_size <= len(tweets)
    for run in range(1, RUNS + 1):
        # Pass a list: random.sample does not accept dict views on Python 3.11+.
        author_subset = rand_users(list(tweets.keys()), sample_size)
        assert len(author_subset) == sample_size
        tweets_subset = copy_keys(tweets, author_subset)
        documents, classes = join_documents(tweets_subset)
        # There is one class per author, so this is a multi-class problem.
        # Plain 'f1' is the binary scorer; 'f1_weighted' names the averaging
        # explicitly (support-weighted mean of the per-class F1 scores).
        score = cross_val_score(pipeline, documents, classes, scoring='f1_weighted')
        avg_score = np.mean(score)
        scores[sample_size].append(avg_score)
        print("Run: " + str(run) + ", Samples: " + str(sample_size) + ", Score: " + str(avg_score))
    print("Samples: " + str(sample_size) + ", runs: " + str(RUNS))
    print("Score: {:.3f}".format(np.mean(scores[sample_size])))
    # save what we currently have (the file is rewritten with all scores so far)
    with open(results_output_filename, 'w') as fp:
        json.dump(scores, fp)

Run: 1, Samples: 10, Score: 0.830894671403
Run: 2, Samples: 10, Score: 0.614171824479
Run: 3, Samples: 10, Score: 0.868862016009
Run: 4, Samples: 10, Score: 0.525799264817
Run: 5, Samples: 10, Score: 0.576408569936
Run: 6, Samples: 10, Score: 0.667343195639
Run: 7, Samples: 10, Score: 0.548085994194
Run: 8, Samples: 10, Score: 0.580394591249
Run: 9, Samples: 10, Score: 0.492624602333
Run: 10, Samples: 10, Score: 0.682757697122
Run: 11, Samples: 10, Score: 0.714596245509
Run: 12, Samples: 10, Score: 0.663632246273
Run: 13, Samples: 10, Score: 0.725890160649
Run: 14, Samples: 10, Score: 0.657209379559
Run: 15, Samples: 10, Score: 0.621050298761
Run: 16, Samples: 10, Score: 0.703781287846
Run: 17, Samples: 10, Score: 0.827060294868
Run: 18, Samples: 10, Score: 0.635579350022
Run: 19, Samples: 10, Score: 0.666827002911
Run: 20, Samples: 10, Score: 0.506053860011
Run: 21, Samples: 10, Score: 0.649073606658
Run: 22, Samples: 10, Score: 0.792107856332
Run: 23, Samples: 10, Score: 0.8124417799

In [13]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" style to the plots drawn after this point.
sns.set(style="whitegrid", color_codes=True)

# NOTE: `scores` is created by the experiment cell; on a fresh kernel that cell
# has to run first, otherwise this line raises NameError.
# Presumably yields one column per sample size and one row per run -- TODO confirm.
df = pd.DataFrame.from_dict(scores)

NameError: name 'scores' is not defined

In [1]:
# seaborn has no `violin` function and `df` has no 'X'/'Y'/'Z' columns.
# `df` is wide-form (one column per sample size), so violinplot can draw one
# violin per column directly from it.
violin = sns.violinplot(data=df)
# violinplot returns a single matplotlib Axes (not a grid of axes).
axes = violin.axes
axes.set_ylim(0,)
axes.set_xlabel("Number of sampled authors")
axes.set_ylabel("Mean cross-validated F1 score")

NameError: name 'sns' is not defined