In [2]:
import os
import re
import nltk
from nltk import word_tokenize , sent_tokenize , pos_tag
from tdmh import *
import pandas as pd


# Organise the tweets

Read all the downloaded tweets from the TSV files.

In [10]:
path = os.path.join( 'HanaKimura' , 'tweets_hanakimura.tsv' )
df1 = pd.read_csv( path  , encoding = 'utf-8' , sep = '\t' )

path = os.path.join( 'HanaKimura' , 'tweets_hanakimura_japanese.tsv' )
df2 = pd.read_csv( path  , encoding = 'utf-8' , sep = '\t' )

path = os.path.join( 'HanaKimura' , 'tweets_kimurahana.tsv' )
df3 = pd.read_csv( path  , encoding = 'utf-8' , sep = '\t' )

Change the 'created_at' date into a 'dateframe'

In [11]:
df = pd.concat([df1, df2, df3])
df['created_at']  = pd.to_datetime( df['created_at'] ).dt.tz_localize(None)

The date of Hana Kimura's death

In [12]:
date = pd.to_datetime( '2020-05-23' )

In [18]:
dir = 'Corpus'

if not os.path.exists(dir):
    os.mkdir(dir)

path = os.path.join(dir, 'tweets_before_japanese.txt')
out_before_japanese = open(  path, 'w' , encoding = 'utf-8')

path = os.path.join(dir, 'tweets_before_english.txt' )
out_before_english = open( path , 'w' , encoding = 'utf-8')

path = os.path.join(dir, 'tweets_after_japanese.txt' )
out_after_japanese = open( path , 'w' , encoding = 'utf-8')

path = os.path.join(dir, 'tweets_after__english.txt' )
out_after_english = open( path , 'w' , encoding = 'utf-8')


for i,row in df.iterrows():
    if not re.search( r'^RT' , row['text'] ):
        if row['created_at'] < date:
            if row['lang'] == 'ja' :
                out_before_japanese.write( f"{row['text']}\n" )
            else:
                out_before_english.write( f"{row['text']}\n" )
        else:
            if row['lang'] == 'ja':
                out_after_japanese.write( f"{row['text']}\n" )
            else:
                out_after_english.write( f"{row['text']}\n" )
            
        
out_before_japanese.close()
out_before_english.close()
out_after_japanese.close()
out_after_english.close()
    

## Create a list of all the files in the corpus

In [3]:


texts = []
dir = 'Corpus'

for file in os.listdir(dir):
    if re.search( r'txt$' , file ):
        path = os.path.join( dir , file )
        texts.append(path)


## create text analysis functions

In [9]:
def get_all_hashtags(full_text):
    freq = dict()
    hash_tags = re.findall( r'#\w+\b' , full_text )
    for tag in hash_tags:
        freq[tag] = freq.get(tag,0) + 1
    return freq

def merge_dictionaries(x,y):
    z = x.copy()   
    z.update(y)   
    return z


def get_title(path):
    title = os.path.basename(path)
    if re.search( r'txt$' , title ):
        # Remove txt extension
        title = title[ :title.index('.txt') ]
        # remove commas and dots
        title = re.sub( r'[.,]' , '' , title )
    return title

## create data for all  the texts in the corpus

In [18]:
all_tags = dict()

for text in texts: 
    f = open( text , encoding = 'utf-8')
    full_text = f.read()
    hash_tags = get_all_hashtags(full_text)
    all_tags = merge_dictionaries(all_tags,hash_tags)

most_frequent = []
for tag in sortedByValue( all_tags , ascending = False):
    if all_tags[tag] > 200:
        most_frequent.append(tag)
    

['#誹謗中傷', '#木村花', '#スターダム', '#文春オンライン', '#フジテレビ', '#RIPHanaKimura', '#週刊文春', '#スクープ速報', '#TCS', '#テラスハウス', '#自殺', '#stardom', '#テラハ', '#SNS']


In [21]:



out = open( 'data.csv' , 'w' , encoding = 'utf-8' )

out.write('title')
for t in most_frequent:
    out.write( f',{t}' )
out.write('\n')

for text in texts:
    
    data = dict()
    print( f'Analysing {text} ...')
    
    ## Get the title, based on the filename
    title = get_title( text )
    
    ## read the full text
    fh = open( text, encoding = 'utf-8')
    full_text = fh.read()
    
    hash_tags = get_all_hashtags(full_text)
    
    out.write( f'{title}')
    for t in most_frequent:
        out.write( f',{hash_tags.get(t,0)}' )
    out.write('\n')
    
    

out.close()
print('Done!')

Analysing Corpus/tweets_before_english.txt ...
Analysing Corpus/tweets_after__english.txt ...
Analysing Corpus/tweets_after_japanese.txt ...
Analysing Corpus/tweets_before_japanese.txt ...
Done!
