In [9]:
import os
import re
import nltk
from nltk import word_tokenize , sent_tokenize , pos_tag
from tdmh import *
import pandas as pd


# Organise the tweets

Read all the downloaded tweets from the TSV files.

In [10]:
# Load the three downloaded tweet collections. Every file is a
# tab-separated, UTF-8 encoded export stored in the 'HanaKimura' folder.
tsv_names = (
    'tweets_hanakimura.tsv' ,
    'tweets_hanakimura_japanese.tsv' ,
    'tweets_kimurahana.tsv' ,
)

frames = []
for tsv_name in tsv_names:
    path = os.path.join( 'HanaKimura' , tsv_name )
    frames.append( pd.read_csv( path , sep = '\t' , encoding = 'utf-8' ) )

# Keep the individual frames available under their original names
df1 , df2 , df3 = frames

Combine the three data frames and convert the 'created_at' column into timezone-naive datetime values.

In [11]:
# Stack the three collections into one data frame
# (the original row labels of each frame are kept)
df = pd.concat( [ df1 , df2 , df3 ] )

# Parse the timestamps, then strip the timezone information
# so that the resulting values are timezone-naive
created_at = pd.to_datetime( df['created_at'] )
df['created_at'] = created_at.dt.tz_localize( None )

The date of Hana Kimura's death

In [12]:
# Reference date (midnight, timezone-naive) against which
# the 'created_at' values of the tweets are compared
date = pd.Timestamp( '2020-05-23' )

In [18]:
# Split the tweets over four plain-text corpus files, grouped by period
# (created before the reference date / on or after it) and by language
# ('ja' / everything else). Every tweet is written on a line of its own.
# Tweets whose text starts with 'RT' are left out.

# Named corpus_dir rather than dir, so the builtin dir() is not shadowed here
corpus_dir = 'Corpus'

# exist_ok avoids the check-then-create race of os.path.exists() + os.mkdir()
os.makedirs( corpus_dir , exist_ok = True )

path_before_japanese = os.path.join( corpus_dir , 'tweets_before_japanese.txt' )
path_before_english = os.path.join( corpus_dir , 'tweets_before_english.txt' )
path_after_japanese = os.path.join( corpus_dir , 'tweets_after_japanese.txt' )
# NOTE: the double underscore is the established name of this file; kept unchanged
path_after_english = os.path.join( corpus_dir , 'tweets_after__english.txt' )

# The with statement closes all four files, also when the loop raises an error
with open( path_before_japanese , 'w' , encoding = 'utf-8' ) as out_before_japanese , \
     open( path_before_english , 'w' , encoding = 'utf-8' ) as out_before_english , \
     open( path_after_japanese , 'w' , encoding = 'utf-8' ) as out_after_japanese , \
     open( path_after_english , 'w' , encoding = 'utf-8' ) as out_after_english:

    for i,row in df.iterrows():
        tweet = row['text']

        # A missing text value (e.g. NaN for an empty TSV field) is not a string:
        # re.search() would raise a TypeError on it, so such rows are skipped
        if not isinstance( tweet , str ):
            continue

        # Skip tweets that start with 'RT' (presumably retweets)
        if re.search( r'^RT' , tweet ):
            continue

        is_japanese = row['lang'] == 'ja'

        # Pick the output file for this period and language;
        # every language other than 'ja' goes to the 'english' file
        if row['created_at'] < date:
            target = out_before_japanese if is_japanese else out_before_english
        else:
            target = out_after_japanese if is_japanese else out_after_english

        target.write( f"{tweet}\n" )
    

## Create a list of all the files in the corpus

In [19]:


# Collect the paths of all entries in the corpus folder whose name ends in 'txt'.
# The paths appear in the order in which os.listdir() returns the entries.
dir = 'Corpus'

texts = [
    os.path.join( dir , entry )
    for entry in os.listdir( dir )
    if re.search( r'txt$' , entry )
]


In [15]:
## Create text analysis functions

In [16]:
def get_title(path):
    """Derive a title from the file name in `path`.

    The directory part of `path` is dropped. If the file name ends in
    '.txt', that extension is removed, and all remaining dots and commas
    are removed as well. File names without a '.txt' extension are
    returned unchanged.

    Parameters:
        path: path or file name (string).

    Returns:
        The title as a string.
    """
    extension = '.txt'
    title = os.path.basename(path)

    # Test for the full '.txt' suffix. Testing for 'txt' only would also
    # accept names such as 'plaintxt', which contain no extension to cut off.
    if title.endswith( extension ):
        # Remove only the trailing extension, not an earlier '.txt' in the name
        title = title[ :-len(extension) ]
        # remove commas and dots
        title = re.sub( r'[.,]' , '' , title )
    return title

## Create data for all the texts in the corpus

In [21]:
# For every text in the corpus, count tokens, sentences and a selection of
# POS tags, calculate the type-token ratio (TTR), and write one row per text
# to 'data.csv'. The POS tag columns hold relative frequencies
# (tag count divided by the total number of tokens).
#
# NOTE(review): remove_punctuation is not defined in this notebook; it is
# presumably provided by the star import from tdmh -- confirm.
# NOTE(review): all corpus files, including the Japanese ones, go through the
# same NLTK calls with their default settings -- verify that the results
# for the Japanese files are meaningful.

pos_tags = ['JJ' , 'MD' , 'JJR' , 'JJS' , 'VBD']

# The TTR is calculated over the first ttr_cap tokens of each text at most
ttr_cap = 3000

# The with statement closes the CSV file, also when the analysis raises an error
with open( 'data.csv' , 'w' , encoding = 'utf-8' ) as out:

    ## Header of the CSV file
    out.write('title,tokens,sentences,ttr')
    for t in pos_tags:
        out.write(f',{t}')
    out.write('\n')

    for text in texts:

        data = dict()
        print( f'Analysing {text} ...')

        ## Get the title, based on the filename
        title = get_title( text )

        ## read the full text; the file is closed again right after reading
        with open( text, encoding = 'utf-8') as fh:
            full_text = fh.read()

        ## count the number of sentences
        sentences = sent_tokenize(full_text)
        data['nr_sentences'] = len(sentences)

        # dictionary to count the POS tags
        freq_pos = dict()

        # dictionary to count the types for the type-token ratio
        freq_ttr = dict()

        # token count is initialised at 0
        data['nr_tokens'] = 0

        for s in sentences:
            words = word_tokenize(s)
            words = remove_punctuation(words)

            # Each item in the result of pos_tag consists of two values:
            # the word and its POS tag
            for word, tag in pos_tag(words):

                # count the tokens
                data['nr_tokens'] += 1

                # place tokens in dictionary freq_ttr
                # only if the word count is less than or equal to ttr_cap
                # The nr of items in the dictionary eventually equals the nr of types
                if data['nr_tokens'] <= ttr_cap:
                    freq_ttr[ word ] = freq_ttr.get( word , 0 ) + 1

                ## Count frequencies of all the POS tags
                freq_pos[ tag ] = freq_pos.get( tag ,0) +1

        for t in pos_tags:
            data[t] = freq_pos.get(t,0)

        # Calculate TTR: number of types divided by the number of tokens that
        # were actually sampled. For texts with fewer than ttr_cap tokens this
        # is the token count itself; dividing by ttr_cap would understate their TTR.
        # An empty text gets a TTR of 0.0 instead of a ZeroDivisionError.
        ttr_tokens = min( data['nr_tokens'] , ttr_cap )
        data['ttr'] = len( freq_ttr ) / ttr_tokens if ttr_tokens else 0.0

        # write the results to the CSV file
        out.write( f"{title},{data['nr_tokens']},{data['nr_sentences']},{data['ttr']}" )
        for t in pos_tags:
            # relative frequency of the tag; 0.0 for a text without tokens
            relative_freq = data[t] / data['nr_tokens'] if data['nr_tokens'] else 0.0
            out.write( f",{relative_freq}"  )
        out.write('\n')

print('Done!')

Analysing Corpus/tweets_before_english.txt ...
Analysing Corpus/tweets_after__english.txt ...
Analysing Corpus/tweets_after_japanese.txt ...
Analysing Corpus/tweets_before_japanese.txt ...
Done!
