In [2]:
import json
import glob
import gzip
import heapq
import os
import pandas as pd 
import re
import math
import nltk
import copy
import time
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

cachedStopWords = stopwords.words("english")   #this HUGELY improves exec time

#%%

def decontracted(phrase): #It helps the stemmer to recognize stuff
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def text_preprocessing(input_text):
    input_text = re.sub("(RT )?@[A-Za-z0-9_]+","", input_text) #strips RTs and tags
    input_text = re.sub(r'http\S+', '', input_text)    #strips URLs
    input_text = input_text.lower()    #to LowerCase
    input_text = decontracted(input_text)     #expands contractions
    input_text = re.sub("[^a-zA-Z ]+", " ", input_text).strip()    #strips numbers, punctuation and leading/ending spaces
    tokenized = nltk.word_tokenize (input_text)    #tokenizes    
    tokenized = [i for i in tokenized if not i in cachedStopWords]  #remove stopwords
    stemmer= PorterStemmer()
    tokenized = [stemmer.stem(word) for word in tokenized]   #Porter stemming

    return tokenized

def chiavi():
    chiavi = []
    rad = 'TweetsText-03-'
    for i in range(1,31):
        for j in range (1,25,3):
            chiavi.append(rad + str(i).zfill(2) + '-' + str(j).zfill(2) + '.txt')
    return chiavi

def time_window(n):
    if (n==0):
        return chiavi()[:80]
    else:
        return chiavi()[40*n:40*n+80]
    
def fill_series(diz,window):    #Takes the most common 100k terms and fills their timeseries with 0s in hours in which they are not used
    timeseries = nested_dict()
    window = time_window(window)
    for term in diz:
        #print('The normalized frequency of the term \'{}\' over time is: {}'.format(term,dict.__repr__(tf[term]).replace('TweetsText-','')))
        timeseries[term] = copy.deepcopy(tf[term])
        for key, value in list(timeseries[term].items()):
            if key not in window:
                del timeseries[term][key]
        for el in window:
            if el not in timeseries[term]:
                timeseries[term][el] = 0
    return timeseries
    

#%% Opens the json files contained in the jsonl.gz archives, processes them and create some partial output txt files 

path = r'/Users/lorenzodetomasi/Desktop/esame_stilo/prova'
#df = pd.DataFrame()
#start = time.time()

for filename in glob.glob(os.path.join(path, '*.jsonl.gz')):
    
    nome = 'TweetsText' + filename[-18:-9]    #It will be  '-MM-DD-HH'  (Month-Day-Hour)    
    print("Started creating {}".format(nome))
    
    with gzip.open(filename, 'rb') as f:
     if (nome + '.txt') not in os.listdir(path):   #Checks if this one hasn't already been processed
      with open(path + '//' + nome + '.txt', 'w') as g:
        for jsonObj in f:    #Each tweet in a file is a different json object
            tweetDict = json.loads(jsonObj)
            if "retweeted_status" in tweetDict:   #retweets and normal tweets have a DIFFERENT STRUCTURE! the former's "full_text" is truncated, so it's necessary to load the original tweet
                g.write('\n'.join(text_preprocessing(tweetDict['retweeted_status']['full_text'])))
            else:
                g.write('\n'.join(text_preprocessing(tweetDict['full_text'])))


Started creating TweetsText-03-11-19
Started creating TweetsText-03-18-13
Started creating TweetsText-03-14-04
Started creating TweetsText-03-15-07
Started creating TweetsText-03-19-10
Started creating TweetsText-03-18-19
Started creating TweetsText-03-16-01
Started creating TweetsText-03-11-13
Started creating TweetsText-03-10-10
Started creating TweetsText-03-13-22
Started creating TweetsText-03-14-13
Started creating TweetsText-03-18-04
Started creating TweetsText-03-13-01
Started creating TweetsText-03-19-07
Started creating TweetsText-03-16-22
Started creating TweetsText-03-15-10
Started creating TweetsText-03-11-04
Started creating TweetsText-03-14-19
Started creating TweetsText-03-10-07
Started creating TweetsText-03-13-07
Started creating TweetsText-03-19-01
Started creating TweetsText-03-17-19
Started creating TweetsText-03-12-04
Started creating TweetsText-03-16-10
Started creating TweetsText-03-15-22
Started creating TweetsText-03-10-01
Started creating TweetsText-03-17-13
S

In [3]:
import collections 
path = r'/Users/lorenzodetomasi/Desktop/esame_stilo/prova'
def nested_dict():
    return collections.defaultdict(nested_dict)

max_freq={}
idf={}
tf=nested_dict()
docs = 0

for filename in os.listdir(path):
  if filename.endswith(".txt"): 
    print("Started processing {}".format(filename))
    docs+=1
    max_freq[filename] = 0
    
    freq = {}
    with open(path + '/' + filename) as f:
        words = f.read().splitlines()

    for word in words:                                                    
        if word not in freq:
            freq[word] = 0
            if word in idf:
                idf[word]+=1
            else:
                idf[word] = 1
        freq[word] += 1
        max_freq[filename] = max(max_freq[filename],freq[word])
                    
    for word in words:
       tf[word][filename]=freq[word]/max_freq[filename]
   
for a in idf:
    idf[a]=math.log(docs/idf[a],2)   
    
tf_idf=copy.deepcopy(tf)
sums={}
for a in tf_idf:
    for b in tf_idf[a]:
            tf_idf[a][b] = tf_idf[a][b]*idf[a]    
    sums[a] = sum(tf_idf[a].values()) 
    
#top100k = heapq.nlargest(100000, sums, key=sums.__getitem__)  #Only top 100k tokens are necessary 
n = 100000     
top100k = dict(collections.Counter(sums).most_common(n))                                                            
#print('\n Top {} words: '.format(n))
#print(dict(collections.Counter(sums).most_common(n)))
timeseries1 = fill_series(top100k,0)
timeseries2 = fill_series(top100k,1)
timeseries3 = fill_series(top100k,2)
timeseries4 = fill_series(top100k,3)
timeseries5 = fill_series(top100k,4)

PAA = pd.DataFrame()
transformer = PiecewiseAggregateApproximation(window_size=8)
for diz in timeseries1:
    for key, value in diz.items():
        PAA[key] = transformer.transform(np.array(list(timeseries1[diz].values())).reshape(1,-1))

       
#%%  TESTS
       
#import time
#start = time.time()
#for i in range (1000):
#    text_preprocessing(tweetDict['full_text'])
#end = time.time() 
#print(end-start)       

Started processing TweetsText-03-11-04.txt
Started processing TweetsText-03-16-19.txt
Started processing TweetsText-03-11-10.txt
Started processing TweetsText-03-13-01.txt
Started processing TweetsText03-11-22.txt
Started processing TweetsText03-18-13.txt
Started processing TweetsText03-18-07.txt
Started processing TweetsText03-14-07.txt
Started processing TweetsText03-14-13.txt
Started processing TweetsText-03-11-13.txt
Started processing TweetsText-03-11-07.txt
Started processing TweetsText03-18-04.txt
Started processing TweetsText03-18-10.txt
Started processing TweetsText03-16-01.txt
Started processing TweetsText-03-14-22.txt
Started processing TweetsText-03-18-22.txt
Started processing TweetsText03-13-19.txt
Started processing TweetsText03-14-10.txt
Started processing TweetsText03-14-04.txt
Started processing TweetsText-03-13-13.txt
Started processing TweetsText-03-13-07.txt
Started processing TweetsText03-11-19.txt
Started processing TweetsText03-16-10.txt
Started processing Tweet

NameError: name 'PiecewiseAggregateApproximation' is not defined

In [None]:
timeseries

In [None]:
timeseries_flatted = pd.DataFrame.from_dict({(i,j): timeseries[i][j] 
                           for i in timeseries.keys() 
                           for j in timeseries[i].keys()},
                       orient='index')

In [None]:
timetimeseries_flatted=timeseries_flatted.reset_index()


In [None]:
timetimeseries_flatted = timetimeseries_flatted.rename(columns = {'index':'filename'})
timetimeseries_flatted = timetimeseries_flatted.rename(columns = {0:'value'})



In [None]:
timetimeseries_flatted

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.plot(timetimeseries_flatted['value'])

In [None]:
timetimeseries_flatted.filename.values

In [None]:
values_list = list(timetimeseries_flatted.filename.values.flatten())

In [1]:
values_list.to_list()


NameError: name 'values_list' is not defined