# Cleaning the Data

## This script contains the following:
#### 1. Import Libraries
#### 2. Import Data
#### 3. Clean the Data
    Cleaning the article
    Cleaning the country names
#### 4. Exporting the Data

In [5]:
# Import libraries
import pandas as pd
import numpy as np
import nltk
import os
import re

### 2. Import Data

In [7]:
# Load the article
path = os.path.join(os.path.dirname('/Users/matthewjones/Documents/CareerFoundry/Data Visualization with Python/Achievement 1/20th-Century/02. Data/'), '20th Century Events.txt')
    
with open(path, 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

In [8]:
# Import the list of countries as a dataframe
path2 = r'/Users/matthewjones/Documents/CareerFoundry/Data Visualization with Python/Achievement 1/20th-Century/02. Data'

countries = pd.read_csv(os.path.join(path2, 'countries_list_20th_century_1.5.csv'), index_col = 0)

### 3. Clean the Data

#### Cleaning the article
- Replacing shorthand or uncommon names of countries to their more common appearance
- Removing all punctuation, numbers, and non-letters from the text document

In [11]:
from nltk.tokenize import word_tokenize

tokenized_word = word_tokenize(data)
tokenized_word

['Key',
 'events',
 'of',
 'the',
 '20th',
 'century',
 '-',
 'WikipediaJump',
 'to',
 'contentMain',
 'menuMain',
 'menumove',
 'to',
 'sidebarhide',
 'Navigation',
 'Main',
 'pageContentsCurrent',
 'eventsRandom',
 'articleAbout',
 'WikipediaContact',
 'usDonate',
 'Contribute',
 'HelpLearn',
 'to',
 'editCommunity',
 'portalRecent',
 'changesUpload',
 'fileSearchSearchAppearanceCreate',
 'accountLog',
 'inPersonal',
 'tools',
 'Create',
 'account',
 'Log',
 'in',
 'Pages',
 'for',
 'logged',
 'out',
 'editors',
 'learn',
 'moreContributionsTalkContentsmove',
 'to',
 'sidebarhide',
 '(',
 'Top',
 ')',
 '1Historic',
 'events',
 'in',
 'the',
 '20th',
 'centuryToggle',
 'Historic',
 'events',
 'in',
 'the',
 '20th',
 'century',
 'subsection1.1The',
 'world',
 'at',
 'the',
 'beginning',
 'of',
 'the',
 'century1.1.1',
 "''",
 'The',
 'war',
 'to',
 'end',
 'all',
 'wars',
 "''",
 ':',
 'World',
 'War',
 'I',
 '(',
 '1914–1918',
 ')',
 '1.1.2The',
 'Russian',
 'Revolution',
 'and',
 'Co

In [12]:
# Remove the word 'the' from the text
stop_word = ['the', 'The']

filtered_words = [] # creates an empty list
for word in tokenized_word:
    if word not in stop_word:
        filtered_words.append(word)

In [13]:
# Substitute all punctuations marks with a space 
sans_punctuation = re.sub(r"[',([)\]]",  # Search for all unnecessary punctuation
                    " ",   # Replace all characters with spaces
                    str(filtered_words))

sans_numbers = re.sub(r" \d+",
                      "",
                      str(sans_punctuation))

In [14]:
# Replacing shorthand country names in the txt file
adding_ussr = sans_numbers.replace('USSR', 'Soviet Union')
adding_us = adding_ussr.replace('U.S.', 'United States').replace('US', 'United States').replace('America', 'United States')
changing_gbr = adding_us.replace('Great Britain', 'United Kingdom').replace('Britain', 'United Kingdom')

In [15]:
changing_gbr = changing_gbr.replace('Soviet Union', 'the Soviet Union').replace('United States', 'the United States').replace('United Kingdom', 'the United Kingdom')

In [16]:
changing_gbr



#### Cleaning the country names
- Creating aliases for countries
- Removing the excess space in the country names
- Adding countries that were dissolved or broken up during the 20th Century and do not appear on our list from 2024

In [19]:
# Check the shape (209 countries)
countries.shape

(209, 1)

In [20]:
# Check the output
countries.head()

Unnamed: 0,country_name
1,Afghanistan
2,Albania
3,Algeria
4,Andorra
5,Angola


In [21]:
#Adding countries that have ceased to exist and were not on the list before
new_countries = [['Soviet Union'], ['Yugoslavia'], ['Czechoslovakia'], ['Burma'], ['Guam']]

df_new_countries = pd.DataFrame(new_countries, columns = ['country_name'])
combined_countries = pd.concat([countries, df_new_countries], ignore_index=True)
    
combined_countries

Unnamed: 0,country_name
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola
...,...
209,Soviet Union
210,Yugoslavia
211,Czechoslovakia
212,Burma


In [22]:
combined_countries['country_alias'] = combined_countries['country_name']
combined_countries

Unnamed: 0,country_name,country_alias
0,Afghanistan,Afghanistan
1,Albania,Albania
2,Algeria,Algeria
3,Andorra,Andorra
4,Angola,Angola
...,...,...
209,Soviet Union,Soviet Union
210,Yugoslavia,Yugoslavia
211,Czechoslovakia,Czechoslovakia
212,Burma,Burma


In [23]:
# Create aliases for countries to match how they are referred in the article
combined_countries['country_alias'] = combined_countries['country_alias'].replace("  China, People's Republic of ", "China")
combined_countries['country_alias'] = combined_countries['country_alias'].replace("   Micronesia, Federated States of ", "Micronesia")
combined_countries['country_alias'] = combined_countries['country_alias'].replace("   North Macedonia ", "Macedonia")
combined_countries['country_alias'] = combined_countries['country_alias'].replace("  Bosnia and Herzegovina ", "Bosnia")
combined_countries['country_alias'] = combined_countries['country_alias'].replace("  East Timor ", "Timor")
combined_countries['country_alias'] = combined_countries['country_alias'].replace("   São Tomé and Príncipe ", "Príncipe")
combined_countries['country_alias'] = combined_countries['country_alias'].replace("   Korea, North ", "North Korea")
combined_countries['country_alias'] = combined_countries['country_alias'].replace("   Korea, South ", "South Korea")
combined_countries['country_alias'] = combined_countries['country_alias'].replace("Soviet Union", "the Soviet Union")
combined_countries['country_alias'] = combined_countries['country_alias'].replace("   United States ", "the United States")
combined_countries['country_alias'] = combined_countries['country_alias'].replace("   United Kingdom ", "the United Kingdom")

# Countries that did not receive an alias were likely countries that did not appear in the article

In [24]:
# Convert the 'country_name' column into a list
country_list = combined_countries['country_alias'].to_list()

In [25]:
# For every item in the list, take out the additional spaces and add them to a new list
cleaned_country_list = []

for item in country_list:
    cleaned_country_list.append(item.strip())

In [26]:
# Check the output
cleaned_country_list

['Afghanistan',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas, The',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia',
 'Botswana',
 'Brazil',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo, Democratic Republic of the',
 'Congo, Republic of the',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Timor',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gabon',
 'Gambia, The',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guinea',
 'Bissau',
 'Guyana',
 'Haiti',
 'H

In [27]:
# Convert the list to a dataframe and rename the column
df_cleaned_countries = combined_countries
df_cleaned_countries['clean_country_alias'] = cleaned_country_list

In [28]:
# Check the output
df_cleaned_countries

Unnamed: 0,country_name,country_alias,clean_country_alias
0,Afghanistan,Afghanistan,Afghanistan
1,Albania,Albania,Albania
2,Algeria,Algeria,Algeria
3,Andorra,Andorra,Andorra
4,Angola,Angola,Angola
...,...,...,...
209,Soviet Union,the Soviet Union,the Soviet Union
210,Yugoslavia,Yugoslavia,Yugoslavia
211,Czechoslovakia,Czechoslovakia,Czechoslovakia
212,Burma,Burma,Burma


### 4. Exporting the Data

In [17]:
# Rerite and rename the Wikipedia text document in the Data folder
path3 = os.path.join(os.path.dirname('/Users/matthewjones/Documents/CareerFoundry/Data Visualization with Python/Achievement 1/20th-Century/02. Data/'), '20th Century Events_sans_punc.txt')
    
with open(path3, 'w') as f:
    f.write(changing_gbr)

In [29]:
# Save the new countries list dataframe in the Data folder
df_cleaned_countries.to_csv(os.path.join(path2, 'cleaned_countries_list.csv'))