In [1]:
# Import Dependencies
from pymongo import MongoClient
from config import cloudM, cloudMpassword
import pandas as pd
import numpy as np
import datetime as dt
import csv
import re

In [2]:
# set up display area to show dataframe 
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
#create a variable for columns to keep

columns = ["Country", 
            "People and Society: Population",
            "People and Society: Nationality - noun",
            "People and Society: Nationality - adjective",
            "People and Society: Religions"]     

In [4]:
#load data and create df to clean
data = pd.read_csv("countries.csv")
#previouly determined all fields were objects, load as string for cleaning
df = pd.DataFrame(data).astype("string")
df = df.loc[:, columns].copy()
#check result
df.head(5)

Unnamed: 0,Country,People and Society: Population,People and Society: Nationality - noun,People and Society: Nationality - adjective,People and Society: Religions
0,Afghanistan,"38,346,720 (2022 est.)",Afghan(s),Afghan,"Muslim 99.7% (Sunni 84.7 - 89.7%, Shia 10 - 15..."
1,Akrotiri,"(2020) approximately 18,195 on the Sovereign B...",,,
2,Albania,"3,095,344 (2022 est.)",Albanian(s),Albanian,"Muslim 56.7%, Roman Catholic 10%, Orthodox 6.8..."
3,Algeria,"44,178,884 (2022 est.)",Algerian(s),Algerian,"Muslim (official; predominantly Sunni) 99%, ot..."
4,American Samoa,"45,443 (2022 est.)",American Samoan(s) (US nationals),American Samoan,"Christian 98.3%, other <1%, unaffiliated <1% (..."


In [5]:
#drop null values & determing number of rows 
df = df.dropna()
len(df)

228

In [6]:
#set index to country & check result
df.set_index('Country')
df.head(2)

Unnamed: 0,Country,People and Society: Population,People and Society: Nationality - noun,People and Society: Nationality - adjective,People and Society: Religions
0,Afghanistan,"38,346,720 (2022 est.)",Afghan(s),Afghan,"Muslim 99.7% (Sunni 84.7 - 89.7%, Shia 10 - 15..."
2,Albania,"3,095,344 (2022 est.)",Albanian(s),Albanian,"Muslim 56.7%, Roman Catholic 10%, Orthodox 6.8..."


In [7]:
df["People and Society: Religions"]

0      Muslim 99.7% (Sunni 84.7 - 89.7%, Shia 10 - 15...
2      Muslim 56.7%, Roman Catholic 10%, Orthodox 6.8...
3      Muslim (official; predominantly Sunni) 99%, ot...
4      Christian 98.3%, other <1%, unaffiliated <1% (...
5      Christian (predominantly Roman Catholic) 89.5,...
6      Roman Catholic 41.1%, Protestant 38.1%, other ...
7      Protestant 73.2% (includes Anglican 22.7%, Met...
9      Protestant 68.3% (Anglican 17.6%, Seventh Day ...
10     Roman Catholic 62.9%, Evangelical 15.3% (Pente...
11     Armenian Apostolic 92.6%, Evangelical 1%, othe...
12     Roman Catholic 75.3%, Protestant 4.9% (include...
14     Protestant 23.1% (Anglican 13.3%, Uniting Chur...
15     Catholic 57%, Eastern Orthodox 8.7%, Muslim 7....
16     Muslim 97.3% (predominantly Shia), Christian 2...
17     Protestant 69.9% (includes Baptist 34.9%, Angl...
18     Muslim 73.7%, Christian 9.3%, Jewish 0.1%, oth...
20                 Muslim 88.4%, other 11.6% (2020 est.)
21     Protestant 66.4% (includ

In [8]:
#make "top religion column"
df["Top Religion - Name"] = df["People and Society: Religions"].str[:11]

#replace values
df["Top Religion - Name"] = df["Top Religion - Name"].str.replace("[\d, %, \., \(]", "")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["RomanCatho"], "Roman Catholic")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["Muslimoff"], "Muslim")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["Muslimpre"], "Muslim")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["ArmenianAp"], "Armenian Apostolic")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["LamaisticB"], "Lamaistic Buddhist")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["folkreligi"], "folk religion")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["SunniMusli"], "Sunni Muslim")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["EritreanOr"], "Eritrean Orthodox")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["EkalesiaNi"], "Ekalesia Niue")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["ChurchofN"], "Ekalesia Niue")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["ChurchofS"], "Church of Sweden (Lutheran)")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["Congregatio"], "Congregational Christian Church")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["EthiopianO"], "Ethiopian Orthodox")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["GreekOrtho"], "Greek Orthodox")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["Orthodoxi"], "Orthodox")

df["Top Religion - Name"] = df["Top Religion - Name"].replace([" Buddhistor"], "Buddhist or Taoist")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["traditional"], "Buddhist and Confucian")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["SeventhDay"], " Seventh Day Adventist")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["EasternOrt"], "Eastern Orthodox")

df["Top Religion - Name"] = df["Top Religion - Name"].replace(["RussianOrt"], "Russian Orthodox")

#check results
print(df["Top Religion - Name"])

0                               Muslim
2                               Muslim
3                               Muslim
4                            Christian
5                            Christian
6                       Roman Catholic
7                           Protestant
9                           Protestant
10                      Roman Catholic
11                  Armenian Apostolic
12                      Roman Catholic
14                          Protestant
15                            Catholic
16                              Muslim
17                          Protestant
18                              Muslim
20                              Muslim
21                          Protestant
22                            Orthodox
23                      Roman Catholic
24                      Roman Catholic
25                              Muslim
26                          Protestant
27                  Lamaistic Buddhist
28                      Roman Catholic
29                       

  """


In [9]:
#make "top religion column - percent of population"
df["Top Religion - Percent of Population"] = df["People and Society: Religions"].str[:20]

#df["Top Religion - Percent of Population"] = df["Top Religion - Percent of Population"].str.replace("\[\]", "")

#df["Top Religion - Percent of Population"] = df["Top Religion - Percent of Population"].str[:10]

#remove non-digits
df["Top Religion - Percent of Population"] = df["Top Religion - Percent of Population"].str.replace("[a-zA-Z]", "")

df["Top Religion - Percent of Population"] = df["Top Religion - Percent of Population"].str.replace("[\(\)/, ;, %]", "")

df["Top Religion - Percent of Population"] = df["Top Religion - Percent of Population"].replace(["67.831.9"], "67.8")

df["Top Religion - Percent of Population"] = df["Top Religion - Percent of Population"].replace(["15-"], "17")

df["Top Religion - Percent of Population"] = df["Top Religion - Percent of Population"].replace(["9895"], "98")

#replace strange values


#second slice
#df["Top Religion - Percent of Population"] = df["Top Religion - Percent of Population"].str[:8]
#check results
df["Top Religion - Percent of Population"] 

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()


0       99.7
2       56.7
3           
4       98.3
5           
6       41.1
7       73.2
9       68.3
10      62.9
11         9
12      75.3
14      23.1
15        57
16      97.3
17      69.9
18      73.7
20      88.4
21      66.4
22      48.3
23      57.1
24      40.1
25      27.7
26      46.2
27         7
28        70
29      50.7
30      79.1
32      64.6
34      70.2
35        80
36       59.
37      63.2
38      87.9
39      58.6
40      77.3
41          
42      38.3
43        39
44      67.8
45        89
46      52.1
47        60
48      21.9
49      19.4
51          
52      92.3
53        98
54      29.9
55      33.1
56      62.8
58      47.5
59      86.3
60      58.9
61      72.8
62         8
63         7
64          
66        94
67      52.7
68      44.3
69      68.8
70          
71        50
72        88
73          
74      16.2
75        90
76         4
77      57.1
78      89.3
79        45
80      66.6
81        47
82        54
84      42.3
85      96.4
87          

In [10]:
# create population as of 2022

df["Population (as of 2022)"] = df["People and Society: Population"].str[:-11]

#fix certain rows
df["Population (as of 2022)"] = df["Population (as of 2022)"].replace(["596 (July"], "596")

df["Population (as of 2022)"] = df["Population (as of 2022)"].replace(["8,914,885 (2022 est.) (includes populations of"], "8,914,885")

df["Population (as of 2022)"] = df["Population (as of 2022)"].replace(["67,791,400 (2022 est.) Uni"], "67,791,400")


#check results
df["Population (as of 2022)"]

0                                            38,346,720 
2                                             3,095,344 
3                                            44,178,884 
4                                                45,443 
5                                                85,560 
6                                            34,795,287 
7                                                18,741 
9                                               100,335 
10                                           46,245,668 
11                                            3,000,756 
12                                              122,320 
14                                           26,141,369 
15                                            8,913,088 
16                                           10,353,296 
17                                              355,608 
18                                            1,540,558 
20                                          165,650,475 
21                             

In [11]:
df_r = df.drop(["People and Society: Population", "People and Society: Nationality - adjective"], axis =1)

In [12]:
#reorder column
df_r = df_r[["Country", "Population (as of 2022)", "People and Society: Nationality - noun", "People and Society: Religions", "Top Religion - Name", "Top Religion - Percent of Population"]]

In [13]:
#check result
df_r.head()

Unnamed: 0,Country,Population (as of 2022),People and Society: Nationality - noun,People and Society: Religions,Top Religion - Name,Top Religion - Percent of Population
0,Afghanistan,38346720,Afghan(s),"Muslim 99.7% (Sunni 84.7 - 89.7%, Shia 10 - 15...",Muslim,99.7
2,Albania,3095344,Albanian(s),"Muslim 56.7%, Roman Catholic 10%, Orthodox 6.8...",Muslim,56.7
3,Algeria,44178884,Algerian(s),"Muslim (official; predominantly Sunni) 99%, ot...",Muslim,
4,American Samoa,45443,American Samoan(s) (US nationals),"Christian 98.3%, other <1%, unaffiliated <1% (...",Christian,98.3
5,Andorra,85560,Andorran(s),"Christian (predominantly Roman Catholic) 89.5,...",Christian,
