In [4]:
import pandas as pd
import numpy as np
import re

# Raise pandas' display limits (up to 500 rows / 500 columns, 1000 characters
# wide) so the scraped tables can be inspected without being truncated.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Load Dataset.

In [31]:
# Parse every HTML table on the Wikipedia page into a list of DataFrames.
# NOTE(review): this reads the live article, so the number and order of the
# tables in `dfs` (indexed by position in the cells below) may change whenever
# the page is edited.
url = 'https://en.wikipedia.org/wiki/World_War_II_casualties'
dfs = pd.read_html(url)

In [32]:
# Number of tables that were parsed from the page.
len(dfs)

13

# Save tables locally as CSVs

In [33]:
# Table at position 1: use its first row as the column header, then save the
# remaining rows. `df` is the same object as dfs[1], so the header assignment
# also changes the frame stored in `dfs`.
df = dfs[1]
df.columns = df.iloc[0]
# drop(0) removes the row labelled 0 (the header row, assuming the default
# integer index). to_csv also writes the row index as an unnamed first column.
df.drop(0).copy().to_csv('Human_losses_by_country.csv')

In [22]:
# Table at position 2: only the rows at positions 2-7 are kept, and the first
# of those rows supplies the column header.
df = dfs[2]
df = df.iloc[2:8]
df.columns = df.iloc[0]
# drop(2) removes the header row by label (assumes the default integer index,
# so label 2 is the first row of the slice).
df.drop(2).to_csv('Third_Reich_losses.csv')

In [27]:
# Table at position 4: use its first row as the column header, then save the
# remaining rows. `df` is the same object as dfs[4], so dfs[4] is modified too.
df = dfs[4]
df.columns = df.iloc[0]
# drop(0) removes the row labelled 0 (the header row, assuming the default
# integer index).
df.drop(0).to_csv('Soviet_losses.csv')

In [33]:
# Table at position 5: use its first row as the column header, then save the
# remaining rows. `df` is the same object as dfs[5], so dfs[5] is modified too.
df = dfs[5]
df.columns = df.iloc[0]
# drop(0) removes the row labelled 0 (the header row, assuming the default
# integer index).
df.drop(0).to_csv('Holocaust_Jews.csv')

In [37]:
# Table at position 6: use its first row as the column header, then save the
# remaining rows. `df` is the same object as dfs[6], so dfs[6] is modified too.
df = dfs[6]
df.columns = df.iloc[0]
# drop(0) removes the row labelled 0 (the header row, assuming the default
# integer index).
df.drop(0).to_csv('Holocaust_Roma.csv')

In [48]:
# Table at position 7: use its first row as the column header, then save the
# remaining rows. `df` is the same object as dfs[7], so dfs[7] is modified too.
df = dfs[7]
df.columns = df.iloc[0]
# drop(0) removes the row labelled 0 (the header row, assuming the default
# integer index).
df.drop(0).to_csv('Casualties_by_Branch.csv')

In [51]:
!mkdir csvs
!mv *.csv ./csvs/
!ls
!ls csvs/

mkdir: csvs: File exists
mv: rename *.csv to ./csvs/*.csv: No such file or directory
Untitled.ipynb [34mcsvs[m[m
Casualties_by_Branch.csv    Human_losses_by_country.csv
Holocaust_Jews.csv          Soviet_losses.csv
Holocaust_Roma.csv          Third_Reich_losses.csv


# Cleaning Datasets

In [7]:
def remove_footnote(df, columns, num_footnotes):
    """Return a copy of ``df`` with bracketed footnote markers stripped.

    One pass removes a single ``[...]`` group from each cell of ``columns``
    (the greedy pattern picks the right-most one) and joins the text on
    either side of it with one space. ``num_footnotes`` is the number of
    passes, so it must be at least the largest number of markers found in
    any one cell. Cells are passed through ``str()``; ``df`` is not modified.
    """
    footnote_re = re.compile(r'(.*)(\[.*\])(.*)')

    def strip_one_marker(cell):
        return footnote_re.sub(r'\1 \3', str(cell))

    cleaned = df.copy()
    for _ in range(num_footnotes):
        for name in columns:
            cleaned[name] = list(map(strip_one_marker, cleaned[name]))
    return cleaned

def spot_removal(df, columns):
    """Return a copy of ``df`` with stray note letters removed from ``columns``.

    Examples of what is stripped:
      * ``AB`` from ``AmericaAB`` and from ``AmericaAB (note)AB``
      * ``A2`` from ``(table)A2``

    Cells are passed through ``str()``; ``df`` itself is not modified.
    """
    # (pattern, replacement) pairs, applied to every cell in this order.
    substitutions = (
        # One or two capitals (plus an optional digit) at the very end of the
        # cell, preceded by a character that is not a capital letter.
        (re.compile(r'(.*[^A-Z])([A-Z]?[A-Z][0-9]?)$'), r'\1'),
        # The same kind of marker sitting directly before a parenthesised remark.
        (re.compile(r'(.*[^A-Z])([A-Z]?[A-Z][0-9]?)(\s*\(.*\))'), r'\1\3'),
        # Special case: drop "BE" and wrap the trailing remark in parentheses.
        (re.compile(r'(United Kingdom)(BE) (including Crown Colonies)'), r'\1 (\3)'),
    )

    def scrub(cell):
        text = str(cell)
        for regex, replacement in substitutions:
            text = regex.sub(replacement, text)
        return text

    result = df.copy()
    for name in columns:
        result[name] = [scrub(cell) for cell in result[name]]
    return result

def remove_commas(df, columns):
    """Build a new frame holding ``columns`` of ``df`` with every comma removed.

    Only the listed columns appear in the result, in the order given. Each
    column is accessed through ``.str``, so it has to hold text. ``df`` is
    not modified.
    """
    stripped = pd.DataFrame()
    for name in columns:
        text_column = df[name]
        stripped[name] = text_column.str.replace(',', '')
    return stripped

def split_xtoy(df, columns):
    """Split "x to y" range cells into ``<column>_min`` / ``<column>_max`` text columns.

    For every name in ``columns`` two string columns are appended to a copy
    of ``df``: ``<name>_min`` holds the first number found in the cell and
    ``<name>_max`` the second one. A part that is absent becomes the empty
    string (e.g. a cell with a single number has an empty ``_max``).

    Only the first two numbers of a cell are used. Earlier revisions applied
    the pattern with ``sub``, which substitutes *every* match, so a cell with
    three or more numbers had the later ones glued onto the result
    (``"1 to 2 and 3"`` produced a min of ``"13"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    columns : iterable of str
        Names of the columns to split.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the additional ``_min`` / ``_max`` columns.
    """
    df = df.copy()
    # text, number, text, number, text -- every part is optional.
    pattern = re.compile(r'(\D*)(\d*\.?\d*)(\D*)(\d*\.?\d*)(\D*)')
    for column in columns:
        # Because every part is optional, match() always succeeds at the start
        # of the string, so the match objects are never None.
        matches = [pattern.match(str(elem)) for elem in df[column]]
        df[column+'_min'] = [found.group(2) for found in matches]
        df[column+'_max'] = [found.group(4) for found in matches]
    return df

### Human losses by country

In [5]:
# Reload the raw "human losses by country" table from the CSV saved earlier,
# so cleaning starts from the file on disk rather than the in-memory scrape.
df = pd.read_csv('./csvs/Human_losses_by_country.csv')

In [6]:
# 'Unnamed: 0' is presumably the row index that to_csv wrote next to the data.
# errors='ignore' keeps the cell re-runnable: `df` is rebound to its own
# output, so a second run would otherwise raise KeyError for the missing column.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [43]:
# Cleaning pipeline: strip "[...]" footnote markers (10 passes), remove stray
# note letters, drop commas, then split "x to y" ranges of every column from
# the third onward into <column>_min / <column>_max.
df_minmax = (
    df.pipe(remove_footnote, df.columns, 10)
      .pipe(spot_removal, df.columns)
      .pipe(remove_commas, df.columns)
      .pipe(split_xtoy, df.columns[2:])
)
# Cells that are empty or contain only whitespace become NaN.
df_cleaned = df_minmax.replace(r'^\s*$', np.nan, regex=True)

In [44]:
# Keep the first two columns plus everything from position 8 onward
# (presumably the *_min / *_max columns appended by split_xtoy -- this
# assumes the raw table has 8 columns; TODO confirm).
# .copy() gives df_HLBC its own data, so the assignment below no longer writes
# into a selection of df_cleaned (which triggers SettingWithCopyWarning).
df_HLBC = df_cleaned[df_cleaned.columns[:2].append(df_cleaned.columns[8:])].copy()
# Convert every column except the first to float. The former copy=False
# argument is dropped: a text-to-float conversion always allocates new data.
df_HLBC[df_HLBC.columns[1:]] = df_HLBC[df_HLBC.columns[1:]].astype(float)

# Save cleaned tables locally as CSVs

In [567]:
!mkdir clean_csvs

mkdir: clean_csvs: File exists


In [42]:
# Save the cleaned table. With to_csv's defaults the row index is written as
# an unnamed first column of the file.
df_HLBC.to_csv('./clean_csvs/Human_losses_by_country_CLEAN.csv')