# Data Preprocessing
Data source: https://www.kaggle.com/datasets/foklacu/ukraine-war-tweets-dataset-65-days?resource=download

Download the archive.zip file and unzip it.

## Initial clean
Select several key features (e.g. "username", "date") and save them as a new dataset.

In [None]:
import pandas as pd
import numpy as np
import glob
import os

# Load every raw tweet file, keep the columns of interest, flatten the nested
# "user" record into separate columns, and save the result as a single CSV.

# Absolute Drive path: the other cells of this notebook read and write under
# '/content/drive/MyDrive/cse6740/', so a './content/...' path only matched
# them when the working directory happened to be '/'.
path = r"/content/drive/MyDrive/cse6740/archive" # use your path

# os.listdir returns entries in arbitrary order and also lists sub-directories
# (which read_csv cannot open), so keep regular files only and sort them for a
# reproducible row order.
all_files = sorted(
    name for name in os.listdir(path)
    if os.path.isfile(os.path.join(path, name))
)

li = [
    pd.read_csv(os.path.join(path, filename), index_col=None, header=0)
    for filename in all_files
]

frame = pd.concat(li, axis=0, ignore_index=True)

df = pd.DataFrame(frame, columns = ['date','content', 'user', 'replyCount','retweetCount','likeCount','quoteCount'])

# Each "user" cell is evaluated as a Python expression (presumably a dict
# literal) and its keys are spread into columns.
# SECURITY NOTE(review): eval() executes whatever code the CSV contains. Only
# run this on a trusted download; if every record is a plain literal,
# ast.literal_eval would be a safe replacement - confirm against the data.
user_df = df['user'].map(eval).apply(pd.Series)
cleaned_df = pd.DataFrame(user_df, columns = ['username','displayname','followersCount','friendsCount','statusesCount','favouritesCount','listedCount','mediaCount','location'])
cleaned_df = cleaned_df.join(df.drop(columns = 'user'))

cleaned_df.to_csv('/content/drive/MyDrive/cse6740/cleaned_df.csv')

## Sentiment analysis

https://github.com/pysentimiento/pysentimiento

In [None]:
!pip install transformers
!pip install pysentimiento

# Label every tweet with a sentiment class using pysentimiento and store the
# labelled table as JSON.
from pysentimiento import create_analyzer
from pysentimiento.preprocessing import preprocess_tweet

df = pd.read_csv('/content/drive/MyDrive/cse6740/cleaned_df.csv')

tweets = df['content'].tolist()

# Normalise the raw tweet text before it is passed to the model.
tws_clean = [preprocess_tweet(tw) for tw in tweets]

# NOTE(review): lang="es" requests the Spanish model - confirm that this is
# intended for this dataset (pysentimiento also provides lang="en").
analyzer = create_analyzer(task="sentiment", lang="es")

# One predicted label per cleaned tweet. Building the list here guarantees
# that `res` exists before it is attached to the frame.
res = [analyzer.predict(tw).output for tw in tws_clean]

# new_df is the same object as df, so df gains the 'res' column as well.
new_df = df
new_df['res'] = res

new_df.to_json('/content/drive/MyDrive/cse6740/labeled_sa_df.json')

# Geolocation labeling

A more precise way to get the geographical information is to use a geocoding library.




In [None]:
# Geocode every free-text user location into longitude/latitude strings.
# Locations that are missing or cannot be resolved are recorded as "nan".
location = df['location'].tolist()
longitude = []
latitude = []

# Look the geocoder up once, outside the try block: if `geolocator` has not
# been created yet this fails immediately instead of being swallowed and
# silently turning every row into "nan".
geocode = geolocator.geocode

for loc in location:
    lon = lat = "nan"
    if str(loc) != 'nan':
        try:
            geolo = geocode(loc)
        except Exception:
            # Best effort: a failed lookup just leaves this row as "nan".
            # Unlike a bare `except:`, this still lets KeyboardInterrupt
            # stop the (long-running) loop.
            geolo = None
        if geolo:
            lon, lat = str(geolo.longitude), str(geolo.latitude)
    # Append both values together so the two lists always stay aligned.
    longitude.append(lon)
    latitude.append(lat)

However, querying the geocoding web service for every tweet could take too much time, because the number of requests the service will answer is limited.

So instead we use keyword filtering to find the country name mentioned in each location string.

In [None]:
# Keyword filtering: tag each location with the first pycountry country whose
# name occurs in the location text, or "Unknown" when none does.
# NOTE(review): `pycountry` is not imported in the visible cells - confirm it
# is imported before this cell runs.

# The list of country names does not change, so build it once up front.
country_names = [country.name for country in pycountry.countries]

countryname = []
for loc in location:
    # str() keeps the substring test valid even for a non-string value.
    text = str(loc)
    match = None
    if text != 'nan':
        match = next((name for name in country_names if name in text), None)
    countryname.append(match if match is not None else "Unknown")

# Geocode every country name once and cache its coordinates in
# country_loc_dict as [longitude, latitude].
geolocator = Nominatim(user_agent="my_user_agent", timeout = 10)
longitude = []
latitude = []

country_loc_dict = {}
for country in pycountry.countries:
    geolo = geolocator.geocode(country.name)
    if geolo:
        country_loc_dict[country.name] = [geolo.longitude, geolo.latitude]
    else:
        # Placeholder for names the geocoder could not resolve.
        country_loc_dict[country.name] = [-1,-1]

# Hand-entered coordinates (presumably for names the geocoder did not
# resolve). They must use the same [longitude, latitude] order as the
# geocoded entries, because index 0 is read as the longitude and index 1 as
# the latitude when the per-tweet columns are filled.
country_loc_dict.update({
    'Bonaire, Sint Eustatius and Saba': [-68.308183, 12.1683718],
    'Holy See (Vatican City State)': [12.4520834, 41.9038795],
    "Korea, Democratic People's Republic of": [127.4310054, 40.3424611],
    'Taiwan, Province of China': [121.0211024, 23.553118],
    'United States Minor Outlying Islands': [166.6280441, 19.295374],
    'Virgin Islands, British': [-64.5854311, 18.4180894],
})

# Turn each matched country name into a coordinate pair, attach the three new
# columns to new_df and write the table out as JSON.
# `longitude` and `latitude` are appended to, so they are expected to be the
# (empty) lists created earlier in this cell.
for matched in countryname:
    if matched != 'Unknown':
        coords = country_loc_dict[matched]
        lon, lat = coords[0], coords[1]
    else:
        # No country keyword was found for this row.
        lon = float('nan')
        lat = float('nan')
    longitude.append(lon)
    latitude.append(lat)

new_df['country'] = countryname
new_df['longitude'] = longitude
new_df['latitude'] = latitude

new_df.to_json('/content/drive/MyDrive/cse6740/sa_loc_df.json')

# Visualization

For visualization, we selected only the small fraction of the data in which the "country" column is not null.

In [None]:
from datetime import datetime
import time

# Build the table used for visualization: keep five columns of loc_df, add a
# day-level 'Date' column and a numeric sentiment score 'SA', then save a CSV.
new_df = pd.DataFrame(loc_df, columns = ['username','date', 'content', 'country', 'label'])

date = new_df['date'].tolist()

# Cut each timestamp at the first '+' (presumably the UTC offset), parse the
# remaining 'YYYY-MM-DD HH:MM:SS' text and keep only the calendar day.
new_date = [
    datetime.strptime(str(stamp).split("+")[0], '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
    for stamp in date
]

new_df['Date'] = new_date

# Sentiment labels are mapped onto a signed numeric score.
sentiment_codes = {'POS':1, 'NEU': 0, 'NEG':-1}
new_df['SA'] = new_df['label'].replace(sentiment_codes)

data_sample = pd.DataFrame(new_df, columns = ['username','content','country', 'Date','SA'])
data_sample.to_csv('/content/drive/MyDrive/cse6740/sample_df.csv')