# CIS9650 - Final Project - Part A
## Group 2 - Top 10 Economies in the World

Maria Pereyra <br>
Daiana Vega <br>
Valerie Alvarez <br>
Nubia Stefany Palacios <br>

## *******************************************
For the first part of our final project, our group decided to scrape data from Investopedia.com and investigate the countries that have the top economies in the world. We used BeautifulSoup to parse the data and focused on the top 10 countries, so that readers get a simplified and organized view for comparing those countries and their economic data in a dataframe. We then exported our final results to a CSV file.

In [17]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# The website that we chose:
URL = "https://www.investopedia.com/insights/worlds-top-economies"

# Browser-like request headers. The User-Agent carries information that a
# browser would normally send, such as the navigator (Mozilla), and
# Accept-Language carries the preferred language (en-US). Some websites don't
# return any information for requests without them, so we send them along.
# The User-Agent is written as adjacent string literals instead of a backslash
# continuation inside the quotes, because a continuation inside a string
# literal also embeds the next line's leading indentation in the header value.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36'
    ),
    'Accept-Language': 'en-US, en;q=0.5',
}

# Seconds to wait for the server before giving up; without an explicit
# timeout, requests can wait indefinitely for a response.
REQUEST_TIMEOUT = 30

# investopedia_response = here we obtain all the information from the website.
# We could print investopedia_response.text to see the raw HTML.
investopedia_response = requests.get(
    URL, headers=HEADERS, timeout=REQUEST_TIMEOUT
)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error page.
investopedia_response.raise_for_status()

# soup = once we have the page, we use the BeautifulSoup library to parse the
# HTML so that we can extract the information that we need/want.
soup = BeautifulSoup(investopedia_response.content, "html.parser")

# rows = we used select to get exactly the data we needed. In the page source
# we found the table with the countries inside
# div[class="mntl-sc-block-table__table-wrapper"], then selected the table
# rows with tr. We added :not(:first-child) because we wanted all the rows but
# the first one, which only holds the heading. select returns a list.
rows = soup.select('div[class="mntl-sc-block-table__table-wrapper"] tr:not(:first-child)')

# Printing the rows (the whole list at once).
print(rows)
# Alternative that prints one row at a time:
# for item in rows:
#     print(item)



[<tr>
<td>United States</td>
<td>$21.43</td>
<td>$21.43</td>
<td>2.2%</td>
<td>$65,298</td>
</tr>, <tr>
<td>China</td>
<td>$14.34</td>
<td>$23.52</td>
<td>6.1%</td>
<td>$10,262</td>
</tr>, <tr>
<td>Japan</td>
<td>$5.08</td>
<td>$5.46</td>
<td>0.7%</td>
<td>$40,247</td>
</tr>, <tr>
<td>Germany</td>
<td>$3.86</td>
<td>$4.68</td>
<td>0.6%</td>
<td>$46,445</td>
</tr>, <tr>
<td>India</td>
<td>$2.87</td>
<td>$9.56</td>
<td>4.2%</td>
<td>$2,100</td>
</tr>, <tr>
<td>United Kingdom</td>
<td>$2.83</td>
<td>$3.25</td>
<td>1.5%</td>
<td>$42,330</td>
</tr>, <tr>
<td>France</td>
<td>$2.72</td>
<td>$3.32</td>
<td>1.5%</td>
<td>$40,493.9</td>
</tr>, <tr>
<td>Italy</td>
<td>$2.00</td>
<td>$2.67</td>
<td>0.3%</td>
<td>$33,228.2</td>
</tr>, <tr>
<td>Brazil</td>
<td>$1.84</td>
<td>$3.23</td>
<td>1.1%</td>
<td>$8,717</td>
</tr>, <tr>
<td>Canada</td>
<td>$1.74</td>
<td>$1.93</td>
<td>1.7%</td>
<td>$46,195</td>
</tr>]


In [None]:
## Question: What is the difference between find vs. select?

# "select" finds every element that matches a CSS selector and returns a list, whereas "find" returns only the first match.
# We used find because, when inspecting the page source, we found exactly where the chunk of data
# we wanted to further investigate was, and we grabbed the div class for it. In case the same div class showed up
# somewhere else in the page, we wanted to make sure we were only grabbing the first one / the one we wanted.
# NOTE(review): the scraping code in this notebook calls soup.select(...), not find - confirm which method this answer describes.

In [9]:
# Viewing just the data values: print the text of every table row, with the
# surrounding whitespace removed.

data = rows

for row_tag in data:
    row_text = row_tag.text.strip()
    print(row_text)

United States
$21.43
$21.43
2.2%
$65,298
China
$14.34
$23.52
6.1%
$10,262
Japan
$5.08
$5.46
0.7%
$40,247
Germany
$3.86
$4.68
0.6%
$46,445
India
$2.87
$9.56
4.2%
$2,100
United Kingdom
$2.83
$3.25
1.5%
$42,330
France
$2.72
$3.32
1.5%
$40,493.9
Italy
$2.00
$2.67
0.3%
$33,228.2
Brazil
$1.84
$3.23
1.1%
$8,717
Canada
$1.74
$1.93
1.7%
$46,195


In [18]:
# Collect the stripped text of every row into a list (one string per row),
# then show the list.

lst = [row_tag.text.strip() for row_tag in data]
print(lst)

['United States\n$21.43\n$21.43\n2.2%\n$65,298', 'China\n$14.34\n$23.52\n6.1%\n$10,262', 'Japan\n$5.08\n$5.46\n0.7%\n$40,247', 'Germany\n$3.86\n$4.68\n0.6%\n$46,445', 'India\n$2.87\n$9.56\n4.2%\n$2,100', 'United Kingdom\n$2.83\n$3.25\n1.5%\n$42,330', 'France\n$2.72\n$3.32\n1.5%\n$40,493.9', 'Italy\n$2.00\n$2.67\n0.3%\n$33,228.2', 'Brazil\n$1.84\n$3.23\n1.1%\n$8,717', 'Canada\n$1.74\n$1.93\n1.7%\n$46,195']


In [4]:
# Column names, listed in the same order as the <td> cells of each table row.
FIELD_NAMES = (
    'Country',
    'Nominal_GDP',
    'PPP_Adjusted',
    'Annual_Growth',
    'GDP_Per_Capita',
)

# We create a list of dictionaries, one dictionary per country, so that the
# data for each country is kept apart and labelled by column name.
top_countries = []

for table_row in rows:
    tds = table_row.select('td')
    # Pair each column name with the text of the cell at the same position.
    record = {name: tds[position].text for position, name in enumerate(FIELD_NAMES)}
    top_countries.append(record)

top_countries

[{'Country': 'United States',
  'Nominal_GDP': '$21.43',
  'PPP_Adjusted': '$21.43',
  'Annual_Growth': '2.2%',
  'GDP_Per_Capita': '$65,298'},
 {'Country': 'China',
  'Nominal_GDP': '$14.34',
  'PPP_Adjusted': '$23.52',
  'Annual_Growth': '6.1%',
  'GDP_Per_Capita': '$10,262'},
 {'Country': 'Japan',
  'Nominal_GDP': '$5.08',
  'PPP_Adjusted': '$5.46',
  'Annual_Growth': '0.7%',
  'GDP_Per_Capita': '$40,247'},
 {'Country': 'Germany',
  'Nominal_GDP': '$3.86',
  'PPP_Adjusted': '$4.68',
  'Annual_Growth': '0.6%',
  'GDP_Per_Capita': '$46,445'},
 {'Country': 'India',
  'Nominal_GDP': '$2.87',
  'PPP_Adjusted': '$9.56',
  'Annual_Growth': '4.2%',
  'GDP_Per_Capita': '$2,100'},
 {'Country': 'United Kingdom',
  'Nominal_GDP': '$2.83',
  'PPP_Adjusted': '$3.25',
  'Annual_Growth': '1.5%',
  'GDP_Per_Capita': '$42,330'},
 {'Country': 'France',
  'Nominal_GDP': '$2.72',
  'PPP_Adjusted': '$3.32',
  'Annual_Growth': '1.5%',
  'GDP_Per_Capita': '$40,493.9'},
 {'Country': 'Italy',
  'Nominal_GDP'

In [5]:
# Converting the list of dictionaries above into a dataframe.
# We use the pandas library for this: each dictionary becomes one row of the
# table, the dictionary keys become the column names, and the values are
# placed under the matching column.

df = pd.DataFrame(data=top_countries)
print(df)

          Country Nominal_GDP PPP_Adjusted Annual_Growth GDP_Per_Capita
0   United States      $21.43       $21.43          2.2%        $65,298
1           China      $14.34       $23.52          6.1%        $10,262
2           Japan       $5.08        $5.46          0.7%        $40,247
3         Germany       $3.86        $4.68          0.6%        $46,445
4           India       $2.87        $9.56          4.2%         $2,100
5  United Kingdom       $2.83        $3.25          1.5%        $42,330
6          France       $2.72        $3.32          1.5%      $40,493.9
7           Italy       $2.00        $2.67          0.3%      $33,228.2
8          Brazil       $1.84        $3.23          1.1%         $8,717
9          Canada       $1.74        $1.93          1.7%        $46,195


In [6]:
# Exporting our dataframe into a CSV file.
# index=False keeps the dataframe's row index (0, 1, 2, ...) out of the file.

csv_path = r'C:\Users\daian\Desktop\cis-python\finalproject.csv'
df.to_csv(csv_path, index=False)