# 1. Data to be used

All data is from the CIA World Factbook (https://www.cia.gov/library/publications/resources/the-world-factbook/)

1. 'emissions' reports the metric tons (Mt) of carbon dioxide emitted nationally from consumption of energy; most values are quoted in millions (or billions) of Mt
2. 'urban' is the percent of total population living in urban areas
3. 'gdp' is the gross domestic product per capita (PPP) in US dollars

Links to the tables of data:

1. emissions = https://www.cia.gov/library/publications/resources/the-world-factbook/fields/274.html
2. urban = https://www.cia.gov/library/publications/resources/the-world-factbook/fields/349.html
3. gdp = https://www.cia.gov/library/publications/resources/the-world-factbook/fields/211.html

# 2. Reading the data

In [662]:
# Scrape the CO2-emissions table from the World Factbook page into a DataFrame.
import pandas as pd

link1 = "https://www.cia.gov/library/publications/resources/the-world-factbook/fields/274.html"
# read_html returns a list of matching tables; only tables with id="fieldListing"
# are requested, and the first match is kept.
emissions = pd.read_html(
    link1,
    header=0,
    flavor='bs4',
    attrs={'id': 'fieldListing'},
)[0]
emissions.head()

Unnamed: 0,Country,Carbon dioxide emissions from consumption of energy
0,Afghanistan,9.067 million Mt (2017 est.)
1,Albania,4.5 million Mt (2017 est.)
2,Algeria,135.9 million Mt (2017 est.)
3,American Samoa,"361,100 Mt (2017 est.)"
4,Angola,20.95 million Mt (2017 est.)


In [663]:
# Scrape the urbanization table from the World Factbook page into a DataFrame.
link2 = "https://www.cia.gov/library/publications/resources/the-world-factbook/fields/349.html"
# Only tables with id="fieldListing" are requested; the first match is kept.
urban = pd.read_html(
    link2,
    header=0,
    flavor='bs4',
    attrs={'id': 'fieldListing'},
)[0]
urban.head()

Unnamed: 0,Country,Urbanization
0,Afghanistan,urban population: 25.5% of total population ...
1,Albania,urban population: 60.3% of total population ...
2,Algeria,urban population: 72.6% of total population ...
3,American Samoa,urban population: 87.2% of total population ...
4,Andorra,urban population: 88.1% of total population ...


In [664]:
# Scrape the GDP-per-capita table from the World Factbook page into a DataFrame.
link3 = "https://www.cia.gov/library/publications/resources/the-world-factbook/fields/211.html"
# Only tables with id="fieldListing" are requested; the first match is kept.
gdp = pd.read_html(
    link3,
    header=0,
    flavor='bs4',
    attrs={'id': 'fieldListing'},
)[0]
# Show (rows, columns) instead of a preview.
gdp.shape

(232, 2)

# 3. Merging data sets

In [665]:
# First merge: emissions + urbanization, matched on the shared 'Country' column.
# Preview the result to confirm each country's values line up.
join1 = emissions.merge(urban, on='Country')
join1.head()

Unnamed: 0,Country,Carbon dioxide emissions from consumption of energy,Urbanization
0,Afghanistan,9.067 million Mt (2017 est.),urban population: 25.5% of total population ...
1,Albania,4.5 million Mt (2017 est.),urban population: 60.3% of total population ...
2,Algeria,135.9 million Mt (2017 est.),urban population: 72.6% of total population ...
3,American Samoa,"361,100 Mt (2017 est.)",urban population: 87.2% of total population ...
4,Angola,20.95 million Mt (2017 est.),urban population: 65.5% of total population ...


In [666]:
# The merge above uses pandas' default inner join, so countries missing from either frame were dropped.
# NOTE(review): the original note says 18 countries were dropped; the pre-merge row counts of
# `emissions` and `urban` are not displayed in this notebook, so confirm that figure.
join1.shape

(214, 3)

In [667]:
# Second merge: add GDP per capita to the emissions/urbanization frame, again keyed on 'Country'.
# Preview the result to confirm each country's values line up.
data = join1.merge(gdp, on='Country')
data.head()

Unnamed: 0,Country,Carbon dioxide emissions from consumption of energy,Urbanization,GDP - per capita (PPP)
0,Afghanistan,9.067 million Mt (2017 est.),urban population: 25.5% of total population ...,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0..."
1,Albania,4.5 million Mt (2017 est.),urban population: 60.3% of total population ...,"$12,500 (2017 est.) $12,100 (2016 est.) $1..."
2,Algeria,135.9 million Mt (2017 est.),urban population: 72.6% of total population ...,"$15,200 (2017 est.) $15,200 (2016 est.) $1..."
3,American Samoa,"361,100 Mt (2017 est.)",urban population: 87.2% of total population ...,"$11,200 (2016 est.) $11,300 (2015 est.) $1..."
4,Angola,20.95 million Mt (2017 est.),urban population: 65.5% of total population ...,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6..."


In [668]:
# (rows, columns) of the fully merged frame.
data.shape

(214, 4)

# 4. Renaming columns

In [669]:
# Summarise column labels, non-null counts and dtypes before renaming anything.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 214 entries, 0 to 213
Data columns (total 4 columns):
Country                                                214 non-null object
Carbon dioxide emissions from consumption of energy    214 non-null object
Urbanization                                           214 non-null object
GDP - per capita (PPP)                                 214 non-null object
dtypes: object(4)
memory usage: 8.4+ KB


In [670]:
# Current column labels, in order; the shorter names defined next are paired with them by position.
data.columns

Index(['Country', 'Carbon dioxide emissions from consumption of energy',
       'Urbanization', 'GDP - per capita (PPP)'],
      dtype='object')

In [671]:
# Short replacement labels, listed in the same order as the existing columns.
newNames = [
    'Country',
    'CO2 Emissions',
    'Urbanization',
    'GDP Per Capita',
]

In [672]:
# Map each current column label to its short replacement (paired by position).
nameChanges = dict(zip(data.columns, newNames))

In [673]:
# Apply the short column names. The result is reassigned instead of using
# inplace=True, so the cell produces a new frame rather than mutating the
# object earlier cells displayed; columns= states the axis explicitly.
data = data.rename(columns=nameChanges)

In [674]:
# Preview the frame to confirm the new column names are in place.
data.head()

Unnamed: 0,Country,CO2 Emissions,Urbanization,GDP Per Capita
0,Afghanistan,9.067 million Mt (2017 est.),urban population: 25.5% of total population ...,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0..."
1,Albania,4.5 million Mt (2017 est.),urban population: 60.3% of total population ...,"$12,500 (2017 est.) $12,100 (2016 est.) $1..."
2,Algeria,135.9 million Mt (2017 est.),urban population: 72.6% of total population ...,"$15,200 (2017 est.) $15,200 (2016 est.) $1..."
3,American Samoa,"361,100 Mt (2017 est.)",urban population: 87.2% of total population ...,"$11,200 (2016 est.) $11,300 (2015 est.) $1..."
4,Angola,20.95 million Mt (2017 est.),urban population: 65.5% of total population ...,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6..."


In [675]:
# Every column is still text (object dtype) at this point.
data.dtypes

Country           object
CO2 Emissions     object
Urbanization      object
GDP Per Capita    object
dtype: object

# 5. Cleaning the CO2 Emissions Column

In [676]:
# Keep only the text before ' Mt', e.g. '9.067 million Mt (2017 est.)' -> '9.067 million'.
# The column is addressed by name rather than by position so the step keeps
# working if the column order ever changes.
emissionsnumber = [element.split(' Mt')[0] for element in data['CO2 Emissions']]

# Store the trimmed strings as a new column:
data = data.assign(CO2_Emissions_Number=emissionsnumber)
data.head()

Unnamed: 0,Country,CO2 Emissions,Urbanization,GDP Per Capita,CO2_Emissions_Number
0,Afghanistan,9.067 million Mt (2017 est.),urban population: 25.5% of total population ...,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9.067 million
1,Albania,4.5 million Mt (2017 est.),urban population: 60.3% of total population ...,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4.5 million
2,Algeria,135.9 million Mt (2017 est.),urban population: 72.6% of total population ...,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135.9 million
3,American Samoa,"361,100 Mt (2017 est.)",urban population: 87.2% of total population ...,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100
4,Angola,20.95 million Mt (2017 est.),urban population: 65.5% of total population ...,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20.95 million


In [677]:
# Some trimmed values still carry a unit word ('9.067 million'); record that word
# per row so it can be converted to a multiplier later.

units = []  # one entry per row

# The column is addressed by name rather than by position.
for element in data['CO2_Emissions_Number']:
    parts = element.split(' ')
    # A second token is the unit word; a bare number gets the integer multiplier 1.
    units.append(parts[1] if len(parts) > 1 else 1)

In [678]:
# Attach the per-row unit labels (a unit word, or the integer 1) as a new 'units' column:
data=data.assign(units=units)
data.head()

Unnamed: 0,Country,CO2 Emissions,Urbanization,GDP Per Capita,CO2_Emissions_Number,units
0,Afghanistan,9.067 million Mt (2017 est.),urban population: 25.5% of total population ...,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9.067 million,million
1,Albania,4.5 million Mt (2017 est.),urban population: 60.3% of total population ...,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4.5 million,million
2,Algeria,135.9 million Mt (2017 est.),urban population: 72.6% of total population ...,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135.9 million,million
3,American Samoa,"361,100 Mt (2017 est.)",urban population: 87.2% of total population ...,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100,1
4,Angola,20.95 million Mt (2017 est.),urban population: 65.5% of total population ...,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20.95 million,million


In [679]:
# With the unit word saved separately, keep only the leading numeric token.
# The column is addressed by name rather than by position.
emissionsnumber = [element.split(' ')[0] for element in data['CO2_Emissions_Number']]

# Overwrite the column with the number-only strings:
data = data.assign(CO2_Emissions_Number=emissionsnumber)
data.head()

Unnamed: 0,Country,CO2 Emissions,Urbanization,GDP Per Capita,CO2_Emissions_Number,units
0,Afghanistan,9.067 million Mt (2017 est.),urban population: 25.5% of total population ...,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9.067,million
1,Albania,4.5 million Mt (2017 est.),urban population: 60.3% of total population ...,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4.5,million
2,Algeria,135.9 million Mt (2017 est.),urban population: 72.6% of total population ...,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135.9,million
3,American Samoa,"361,100 Mt (2017 est.)",urban population: 87.2% of total population ...,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100.0,1
4,Angola,20.95 million Mt (2017 est.),urban population: 65.5% of total population ...,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20.95,million


In [680]:
data.units.value_counts() # tally the distinct unit labels; these still have to be turned into numbers

million    162
1           46
billion      6
Name: units, dtype: int64

In [681]:
# Translate each unit word into its numeric multiplier in a single pass;
# anything that is neither 'million' nor 'billion' is left as it is.
newUnits = [
    10**6 if unit == 'million' else 10**9 if unit == 'billion' else unit
    for unit in data.units
]
# Overwrite the 'units' column with the multipliers.
data = data.assign(units=newUnits)
data.head()

Unnamed: 0,Country,CO2 Emissions,Urbanization,GDP Per Capita,CO2_Emissions_Number,units
0,Afghanistan,9.067 million Mt (2017 est.),urban population: 25.5% of total population ...,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9.067,1000000
1,Albania,4.5 million Mt (2017 est.),urban population: 60.3% of total population ...,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4.5,1000000
2,Algeria,135.9 million Mt (2017 est.),urban population: 72.6% of total population ...,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135.9,1000000
3,American Samoa,"361,100 Mt (2017 est.)",urban population: 87.2% of total population ...,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100.0,1
4,Angola,20.95 million Mt (2017 est.),urban population: 65.5% of total population ...,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20.95,1000000


In [682]:
data.dtypes # check dtypes: 'units' should now be integer, the emissions number is still text

Country                 object
CO2 Emissions           object
Urbanization            object
GDP Per Capita          object
CO2_Emissions_Number    object
units                    int64
dtype: object

In [683]:
# Some values use commas as thousands separators. Try the comma-stripping
# regex on a sample string before applying it to the column.
import re

testString = '1,073,002'
pattern = '\\,'  # an escaped comma, i.e. a literal ','
nothing = ''
re.sub(pattern, nothing, testString)

'1073002'

In [684]:
# Strip the thousands-separator commas from every emissions value.
pattern = '\\,'
nothing = ''

# The column is addressed by name rather than by position.
newValues = [re.sub(pattern, nothing, oldValue) for oldValue in data['CO2_Emissions_Number']]

In [685]:
# Write the comma-free strings back. The column holds only digits and decimal points now,
# but it is still text (object dtype) until pd.to_numeric is applied.
data=data.assign(CO2_Emissions_Number=newValues)
data.head()

Unnamed: 0,Country,CO2 Emissions,Urbanization,GDP Per Capita,CO2_Emissions_Number,units
0,Afghanistan,9.067 million Mt (2017 est.),urban population: 25.5% of total population ...,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9.067,1000000
1,Albania,4.5 million Mt (2017 est.),urban population: 60.3% of total population ...,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4.5,1000000
2,Algeria,135.9 million Mt (2017 est.),urban population: 72.6% of total population ...,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135.9,1000000
3,American Samoa,"361,100 Mt (2017 est.)",urban population: 87.2% of total population ...,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100.0,1
4,Angola,20.95 million Mt (2017 est.),urban population: 65.5% of total population ...,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20.95,1000000


In [686]:
data.dtypes # confirms CO2_Emissions_Number is still object dtype, not float64

Country                 object
CO2 Emissions           object
Urbanization            object
GDP Per Capita          object
CO2_Emissions_Number    object
units                    int64
dtype: object

In [687]:
# Parse the cleaned emissions strings into numbers.
data['CO2_Emissions_Number'] = pd.to_numeric(data['CO2_Emissions_Number'])

In [688]:
data.dtypes # confirm CO2_Emissions_Number is now float64

Country                  object
CO2 Emissions            object
Urbanization             object
GDP Per Capita           object
CO2_Emissions_Number    float64
units                     int64
dtype: object

In [689]:
# Preview: the numeric values are not yet scaled by their unit multiplier.
data.head()

Unnamed: 0,Country,CO2 Emissions,Urbanization,GDP Per Capita,CO2_Emissions_Number,units
0,Afghanistan,9.067 million Mt (2017 est.),urban population: 25.5% of total population ...,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9.067,1000000
1,Albania,4.5 million Mt (2017 est.),urban population: 60.3% of total population ...,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4.5,1000000
2,Algeria,135.9 million Mt (2017 est.),urban population: 72.6% of total population ...,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135.9,1000000
3,American Samoa,"361,100 Mt (2017 est.)",urban population: 87.2% of total population ...,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100.0,1
4,Angola,20.95 million Mt (2017 est.),urban population: 65.5% of total population ...,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20.95,1000000


In [690]:
# Scale each value by its unit multiplier to obtain the full number.
# Caution: the column is overwritten, so running this cell twice applies the multiplier twice.
data['CO2_Emissions_Number'] = data['CO2_Emissions_Number'] * data['units']

In [691]:
# Preview the scaled emissions values.
data.head()

Unnamed: 0,Country,CO2 Emissions,Urbanization,GDP Per Capita,CO2_Emissions_Number,units
0,Afghanistan,9.067 million Mt (2017 est.),urban population: 25.5% of total population ...,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9067000.0,1000000
1,Albania,4.5 million Mt (2017 est.),urban population: 60.3% of total population ...,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4500000.0,1000000
2,Algeria,135.9 million Mt (2017 est.),urban population: 72.6% of total population ...,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135900000.0,1000000
3,American Samoa,"361,100 Mt (2017 est.)",urban population: 87.2% of total population ...,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100.0,1
4,Angola,20.95 million Mt (2017 est.),urban population: 65.5% of total population ...,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20950000.0,1000000


In [692]:
# Remove the raw emissions text and the helper 'units' column; only the numeric column is kept.
data = data.drop(columns=['units', 'CO2 Emissions'])

In [693]:
# Preview the frame after the emissions clean-up.
data.head()

Unnamed: 0,Country,Urbanization,GDP Per Capita,CO2_Emissions_Number
0,Afghanistan,urban population: 25.5% of total population ...,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9067000.0
1,Albania,urban population: 60.3% of total population ...,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4500000.0
2,Algeria,urban population: 72.6% of total population ...,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135900000.0
3,American Samoa,urban population: 87.2% of total population ...,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100.0
4,Angola,urban population: 65.5% of total population ...,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20950000.0


## 6. Cleaning the Urbanization Column

In [694]:
# Keep the text before the first '%', e.g. 'urban population: 25.5% of ...' -> 'urban population: 25.5'.
urbanizationnumber = [text.partition('%')[0] for text in data['Urbanization']]
data = data.assign(Urbanization1=urbanizationnumber)
data.head()

Unnamed: 0,Country,Urbanization,GDP Per Capita,CO2_Emissions_Number,Urbanization1
0,Afghanistan,urban population: 25.5% of total population ...,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9067000.0,urban population: 25.5
1,Albania,urban population: 60.3% of total population ...,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4500000.0,urban population: 60.3
2,Algeria,urban population: 72.6% of total population ...,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135900000.0,urban population: 72.6
3,American Samoa,urban population: 87.2% of total population ...,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100.0,urban population: 87.2
4,Angola,urban population: 65.5% of total population ...,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20950000.0,urban population: 65.5


In [695]:
# Take the piece that follows the first ':' (the percentage figure) from the trimmed text.
urbanizationnumber = [text.split(':')[1] for text in data['Urbanization1']]
data = data.assign(Urbanization_Number=urbanizationnumber)
data.head()

Unnamed: 0,Country,Urbanization,GDP Per Capita,CO2_Emissions_Number,Urbanization1,Urbanization_Number
0,Afghanistan,urban population: 25.5% of total population ...,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9067000.0,urban population: 25.5,25.5
1,Albania,urban population: 60.3% of total population ...,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4500000.0,urban population: 60.3,60.3
2,Algeria,urban population: 72.6% of total population ...,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135900000.0,urban population: 72.6,72.6
3,American Samoa,urban population: 87.2% of total population ...,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100.0,urban population: 87.2,87.2
4,Angola,urban population: 65.5% of total population ...,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20950000.0,urban population: 65.5,65.5


In [696]:
# Urbanization_Number is still text (object dtype) here.
data.dtypes

Country                  object
Urbanization             object
GDP Per Capita           object
CO2_Emissions_Number    float64
Urbanization1            object
Urbanization_Number      object
dtype: object

In [697]:
# Parse the percentage strings into numbers.
data['Urbanization_Number'] = pd.to_numeric(data['Urbanization_Number'])

In [698]:
# Confirm Urbanization_Number is now float64.
data.dtypes

Country                  object
Urbanization             object
GDP Per Capita           object
CO2_Emissions_Number    float64
Urbanization1            object
Urbanization_Number     float64
dtype: object

In [699]:
# Remove the raw urbanization text and the intermediate helper column.
data = data.drop(columns=["Urbanization", "Urbanization1"])

In [700]:
# Preview the frame after the urbanization clean-up.
data.head()

Unnamed: 0,Country,GDP Per Capita,CO2_Emissions_Number,Urbanization_Number
0,Afghanistan,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9067000.0,25.5
1,Albania,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4500000.0,60.3
2,Algeria,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135900000.0,72.6
3,American Samoa,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100.0,87.2
4,Angola,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20950000.0,65.5


## 7. Cleaning the GDP Per Capita Column

In [701]:
# Keep only the text before the first '(' so that just the leading dollar figure remains,
# e.g. '$2,000 (2017 est.) $2,000 (2016 est.) ...' -> '$2,000 '.
# The column is addressed by name rather than by position.
gdppercapita = [element.split('(')[0] for element in data['GDP Per Capita']]

# Store the trimmed strings as a new column:
data = data.assign(GDPPerCapita1=gdppercapita)
data.head()

Unnamed: 0,Country,GDP Per Capita,CO2_Emissions_Number,Urbanization_Number,GDPPerCapita1
0,Afghanistan,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9067000.0,25.5,"$2,000"
1,Albania,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4500000.0,60.3,"$12,500"
2,Algeria,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135900000.0,72.6,"$15,200"
3,American Samoa,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100.0,87.2,"$11,200"
4,Angola,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20950000.0,65.5,"$6,800"


In [702]:
# Drop the leading '$' by keeping the piece that follows it.
gdppercapita = [amount.split('$')[1] for amount in data['GDPPerCapita1']]

# Store the dollar-sign-free strings as a new column:
data = data.assign(GDPPerCapita2=gdppercapita)
data.head()

Unnamed: 0,Country,GDP Per Capita,CO2_Emissions_Number,Urbanization_Number,GDPPerCapita1,GDPPerCapita2
0,Afghanistan,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9067000.0,25.5,"$2,000",2000
1,Albania,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4500000.0,60.3,"$12,500",12500
2,Algeria,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135900000.0,72.6,"$15,200",15200
3,American Samoa,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100.0,87.2,"$11,200",11200
4,Angola,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20950000.0,65.5,"$6,800",6800


In [703]:
# Try the comma-stripping regex on a sample GDP-style string.
testString = '2,100'
pattern = '\\,'  # an escaped comma, i.e. a literal ','
nothing = ''
re.sub(pattern, nothing, testString)

'2100'

In [704]:
# Strip the thousands-separator commas from every GDP value.
pattern = '\\,'
nothing = ''
newValues = [
    re.sub(pattern, nothing, amount)
    for amount in data['GDPPerCapita2']
]

In [705]:
# Store the comma-free GDP strings in a new column (still text at this point).
data=data.assign(GDP_Per_Capita_Number=newValues)
data.head()

Unnamed: 0,Country,GDP Per Capita,CO2_Emissions_Number,Urbanization_Number,GDPPerCapita1,GDPPerCapita2,GDP_Per_Capita_Number
0,Afghanistan,"$2,000 (2017 est.) $2,000 (2016 est.) $2,0...",9067000.0,25.5,"$2,000",2000,2000
1,Albania,"$12,500 (2017 est.) $12,100 (2016 est.) $1...",4500000.0,60.3,"$12,500",12500,12500
2,Algeria,"$15,200 (2017 est.) $15,200 (2016 est.) $1...",135900000.0,72.6,"$15,200",15200,15200
3,American Samoa,"$11,200 (2016 est.) $11,300 (2015 est.) $1...",361100.0,87.2,"$11,200",11200,11200
4,Angola,"$6,800 (2017 est.) $7,200 (2016 est.) $7,6...",20950000.0,65.5,"$6,800",6800,6800


In [706]:
# Remove the raw GDP text and both intermediate helper columns.
data = data.drop(columns=["GDP Per Capita", "GDPPerCapita1", "GDPPerCapita2"])

In [707]:
# GDP_Per_Capita_Number is still text (object dtype) here.
data.dtypes

Country                   object
CO2_Emissions_Number     float64
Urbanization_Number      float64
GDP_Per_Capita_Number     object
dtype: object

In [708]:
# 'NA' placeholders have to be blanked out; try the substitution on a sample string first.
testString = 'NA'
pattern = 'NA'
nothing = ''
re.sub(pattern, nothing, testString)

''

In [709]:
# Blank out every 'NA' marker in the GDP strings.
pattern = 'NA'
nothing = ''
newValues = [
    re.sub(pattern, nothing, amount)
    for amount in data['GDP_Per_Capita_Number']
]

In [710]:
# Overwrite the column with the strings whose 'NA' markers were blanked out.
data=data.assign(GDP_Per_Capita_Number=newValues)
data.head()

Unnamed: 0,Country,CO2_Emissions_Number,Urbanization_Number,GDP_Per_Capita_Number
0,Afghanistan,9067000.0,25.5,2000
1,Albania,4500000.0,60.3,12500
2,Algeria,135900000.0,72.6,15200
3,American Samoa,361100.0,87.2,11200
4,Angola,20950000.0,65.5,6800


In [711]:
# Try a substitution that removes every space character from a sample string.
testString = ' NA '
pattern = ' '
nothing = ''
re.sub(pattern, nothing, testString)

'NA'

In [712]:
# Remove every space character from the GDP strings.
pattern = ' '
nothing = ''
newValues = [
    re.sub(pattern, nothing, amount)
    for amount in data['GDP_Per_Capita_Number']
]

In [713]:
# Overwrite the column with the space-free strings.
data=data.assign(GDP_Per_Capita_Number=newValues)
data.head()

Unnamed: 0,Country,CO2_Emissions_Number,Urbanization_Number,GDP_Per_Capita_Number
0,Afghanistan,9067000.0,25.5,2000
1,Albania,4500000.0,60.3,12500
2,Algeria,135900000.0,72.6,15200
3,American Samoa,361100.0,87.2,11200
4,Angola,20950000.0,65.5,6800


In [714]:
# Parse the cleaned GDP strings into numbers.
data['GDP_Per_Capita_Number'] = pd.to_numeric(data['GDP_Per_Capita_Number'])

In [715]:
# Final dtype check: the three measurement columns should all be float64.
data.dtypes

Country                   object
CO2_Emissions_Number     float64
Urbanization_Number      float64
GDP_Per_Capita_Number    float64
dtype: object

## 8. Saving File for R

In [716]:
# Export the cleaned frame for use in R.
# index=False is the documented boolean form for leaving the row index out of the file
# (the earlier index=None only worked because None is falsy).
data.to_csv("FinalProject.csv", index=False)