<a href="https://colab.research.google.com/github/maneakansha36/my_first_repository/blob/main/data_acquisition_akanksha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

Load the CSV file using pandas.read_csv()

In [2]:
population = pd.read_csv("/content/population.csv")
population

Unnamed: 0,country,population
0,United States,1278642419
1,China,792846414
2,India,1001406378
3,Japan,1206263687
4,Germany,428734972
5,Russia,420968276
6,Brazil,675094950
7,United Kingdom,674991378
8,France,434389014
9,Italy,254467210


Load the Excel file using pandas.read_excel()

In [3]:
gdp_data = pd.read_excel("/content/gdp.xlsx")
gdp_data

Unnamed: 0,Country,GDP
0,United States,10450436373455
1,China,5086547998939
2,India,14540718118693
3,Japan,12473835857216
4,Germany,2038442396490
5,Russia,10987125474745
6,Brazil,8793052816619
7,United Kingdom,11799848684255
8,France,7758669752269
9,Italy,12907234326850


In [4]:
internet_users =pd.read_json("/content/internet_users.json")
internet_users

Unnamed: 0,country,internet_users
0,United States,745595490
1,China,685275917
2,India,606850704
3,Japan,1112451555
4,Germany,246827121
5,Russia,162156967
6,Brazil,24027075
7,United Kingdom,266047041
8,France,216417853
9,Italy,87191493


Parse the XML file using xml.etree.ElementTree

In [5]:
literacy_rate = pd.read_xml("/content/literacy_rate.xml")
literacy_rate

Unnamed: 0,name,literacy_rate
0,United States,72.7
1,China,82.2
2,India,80.3
3,Japan,97.5
4,Germany,92.9
5,Russia,89.1
6,Brazil,81.0
7,United Kingdom,82.9
8,France,97.6
9,Italy,83.7


# **Transform Data:**

Standardize country names across all datasets

In [6]:
literacy_rate =literacy_rate.rename(columns={'name':'country'})
literacy_rate

Unnamed: 0,country,literacy_rate
0,United States,72.7
1,China,82.2
2,India,80.3
3,Japan,97.5
4,Germany,92.9
5,Russia,89.1
6,Brazil,81.0
7,United Kingdom,82.9
8,France,97.6
9,Italy,83.7


In [7]:
gdp_data = gdp_data.rename(columns ={'Country': 'country'})
gdp_data

Unnamed: 0,country,GDP
0,United States,10450436373455
1,China,5086547998939
2,India,14540718118693
3,Japan,12473835857216
4,Germany,2038442396490
5,Russia,10987125474745
6,Brazil,8793052816619
7,United Kingdom,11799848684255
8,France,7758669752269
9,Italy,12907234326850


Convert numeric columns to appropriate types

In [8]:
population['population'].dtype

dtype('int64')

In [9]:
gdp_data['GDP'].dtype

dtype('int64')

In [10]:
internet_users['internet_users'].dtype

dtype('int64')

In [11]:
literacy_rate["literacy_rate"].dtype

dtype('float64')

Handle missing or erroneous data

In [12]:
population.isnull().sum()

Unnamed: 0,0
country,0
population,0


In [13]:
gdp_data.isnull().sum()

Unnamed: 0,0
country,0
GDP,0


In [14]:
internet_users.isnull().sum()

Unnamed: 0,0
country,0
internet_users,0


In [15]:
literacy_rate.isnull().sum()

Unnamed: 0,0
country,0
literacy_rate,0


Create new columns such as internet penetration rate (internet users / population × 100)

In [16]:
internet_users['internet_penetration_rate'] =internet_users['internet_users']/population['population']*100

In [18]:
del internet_users['internet_penetration_rate']

In [19]:
internet_users

Unnamed: 0,country,internet_users
0,United States,745595490
1,China,685275917
2,India,606850704
3,Japan,1112451555
4,Germany,246827121
5,Russia,162156967
6,Brazil,24027075
7,United Kingdom,266047041
8,France,216417853
9,Italy,87191493


# Integrate Data:

Merge the datasets based on the country name

In [21]:
data = pd.merge(population,literacy_rate, on='country', how='outer')


data = pd.merge(data, internet_users, on='country', how='outer')

data = pd.merge(data , gdp_data, on='country', how='outer')

display(data)

Unnamed: 0,country,population,literacy_rate,internet_users,GDP
0,Argentina,1432830251,67.8,421477197,8767336567320
1,Australia,439285667,71.6,201619113,5444267975247
2,Brazil,675094950,81.0,24027075,8793052816619
3,Canada,1438267572,70.8,894102645,8158351525190
4,China,792846414,82.2,685275917,5086547998939
5,France,434389014,97.6,216417853,7758669752269
6,Germany,428734972,92.9,246827121,2038442396490
7,India,1001406378,80.3,606850704,14540718118693
8,Indonesia,93409749,75.4,4584702,12404144955318
9,Italy,254467210,83.7,87191493,12907234326850


Handle unmatched or missing records

In [22]:
missing_data = data.isna().sum()
missing_data

Unnamed: 0,0
country,0
population,0
literacy_rate,0
internet_users,0
GDP,0


Validate the data for consistency and completeness

In [23]:
print("data_types:\n", data.dtypes)

data_types:
 country            object
population          int64
literacy_rate     float64
internet_users      int64
GDP                 int64
dtype: object


In [26]:
invalid_literacy = data[(data['literacy_rate'] < 0) | (data['literacy_rate'] > 100)]
invalid_literacy

invalid_population = data[data['population'] <= 0]
invalid_population

data['internet_penetration_rate'] = data['internet_users'] / data['population'] * 100
invalid_penetration = data[(data['internet_penetration_rate'] < 0) | (data['internet_penetration_rate'] > 100)]
invalid_penetration

Unnamed: 0,country,population,literacy_rate,internet_users,GDP,internet_penetration_rate


# Analyze Data:

Find countries with highest internet penetration rates

In [27]:
data.sort_values('internet_penetration_rate', ascending= False)

Unnamed: 0,country,population,literacy_rate,internet_users,GDP,internet_penetration_rate
10,Japan,1206263687,97.5,1112451555,12473835857216,92.222917
16,Spain,898664919,60.6,794943861,15275513099842,88.458317
4,China,792846414,82.2,685275917,5086547998939,86.432366
14,South Africa,916989541,87.7,739194223,3328747361982,80.610976
3,Canada,1438267572,70.8,894102645,8158351525190,62.165251
7,India,1001406378,80.3,606850704,14540718118693,60.599844
19,United States,1278642419,72.7,745595490,10450436373455,58.311493
6,Germany,428734972,92.9,246827121,2038442396490,57.571026
17,Turkey,247285876,71.4,128924625,2581127745611,52.135863
5,France,434389014,97.6,216417853,7758669752269,49.821208


Calculate average literacy rate across countries

In [28]:
avg_rate = data["literacy_rate"].mean()
print("Average Literacy Rate across countries:", avg_rate)

Average Literacy Rate across countries: 78.43


Investigate relationships between GDP, population, literacy, and internet usage

In [29]:

corr = data[["GDP", "population", "literacy_rate", "internet_users"]].corr()
print(corr)

                     GDP  population  literacy_rate  internet_users
GDP             1.000000    0.183118      -0.081700        0.212559
population      0.183118    1.000000      -0.229585        0.749573
literacy_rate  -0.081700   -0.229585       1.000000        0.074808
internet_users  0.212559    0.749573       0.074808        1.000000


# Load into SQLite:

Create an SQLite database

In [30]:
import sqlite3

conn = sqlite3.connect("data")

cursor = conn.cursor()

Load the final merged dataset into a database table

In [31]:
sql = data.to_sql("final_data", conn, if_exists='replace',index=False)
sql

20

In [32]:
cursor = conn.cursor()

cursor.execute("SELECT * FROM final_data LIMIT 5")
rows = cursor.fetchall()

for row in rows:
    print(row)

('Argentina', 1432830251, 67.8, 421477197, 8767336567320, 29.415710388990107)
('Australia', 439285667, 71.6, 201619113, 5444267975247, 45.89703879412028)
('Brazil', 675094950, 81.0, 24027075, 8793052816619, 3.559066024712524)
('Canada', 1438267572, 70.8, 894102645, 8158351525190, 62.165250917581005)
('China', 792846414, 82.2, 685275917, 5086547998939, 86.4323663321759)


# Write and execute SQL queries to extract insights

Countries with highest internet penetration

In [33]:
cursor.execute("""SELECT country, internet_penetration_rate
FROM final_data
ORDER BY internet_penetration_rate DESC
LIMIT 10""" )
rows = cursor.fetchall()
for i in rows:
  print(i)

('Japan', 92.22291667974251)
('Spain', 88.45831679783196)
('China', 86.4323663321759)
('South Africa', 80.61097645605535)
('Canada', 62.165250917581005)
('India', 60.59984411243684)
('United States', 58.311493418395656)
('Germany', 57.57102571982394)
('Turkey', 52.135862785790486)
('France', 49.82120772511065)


Average literacy rate

In [34]:
cursor.execute("SELECT AVG(literacy_rate) FROM final_data")
print(" Average Literacy Rate:", cursor.fetchone()[0])

 Average Literacy Rate: 78.43


Correlation between literacy rate and internet penetration

In [35]:
cursor.execute("""SELECT
    (AVG( internet_penetration_rate	* literacy_rate) - AVG(internet_penetration_rate	) * AVG(literacy_rate)) /
    (SQRT((AVG(internet_penetration_rate	 * internet_penetration_rate	) - AVG(internet_penetration_rate	)*AVG(internet_penetration_rate	)) *
          (AVG(literacy_rate * literacy_rate) - AVG(literacy_rate)*AVG(literacy_rate))))
    AS correlation
FROM final_data;""")

print(cursor.fetchall())

[(0.23533009202708055,)]


GDP per capita analysis

In [36]:
cursor.execute("""
SELECT Country,
       ROUND(GDP / Population, 2) AS gdp_per_capita
FROM final_data

""")

print(cursor.fetchall())

[('Argentina', 6118.0), ('Australia', 12393.0), ('Brazil', 13024.0), ('Canada', 5672.0), ('China', 6415.0), ('France', 17861.0), ('Germany', 4754.0), ('India', 14520.0), ('Indonesia', 132792.0), ('Italy', 50722.0), ('Japan', 10340.0), ('Mexico', 4249.0), ('Russia', 26099.0), ('Saudi Arabia', 9239.0), ('South Africa', 3630.0), ('South Korea', 10609.0), ('Spain', 16998.0), ('Turkey', 10437.0), ('United Kingdom', 17481.0), ('United States', 8173.0)]


Which countries have a GDP per capita above $10,000?

In [37]:
cursor.execute("""SELECT country
FROM final_data
WHERE ROUND(GDP / population, 2) > 10000""")
print(cursor.fetchall())

[('Australia',), ('Brazil',), ('France',), ('India',), ('Indonesia',), ('Italy',), ('Japan',), ('Russia',), ('South Korea',), ('Spain',), ('Turkey',), ('United Kingdom',)]


What is the total population covered in the dataset?

In [38]:
cursor.execute(""" SELECT SUM(Population) AS total_population
FROM final_data;
""")
print(cursor.fetchall())

[(14864685089,)]


Which countries have the lowest literacy rates, and how does that impact internet access?

In [39]:
 cursor.execute("""SELECT Country,literacy_rate,internet_penetration_rate
FROM final_data
ORDER BY literacy_rate ASC;""")

lowest_literacy_countries = cursor.fetchmany(10)

for row in lowest_literacy_countries:
    print(f"Country: {row[0]}, Literacy Rate: {row[1]}, Internet Penetration Rate: {row[2]}")

Country: Saudi Arabia, Literacy Rate: 60.5, Internet Penetration Rate: 11.36642948901554
Country: Spain, Literacy Rate: 60.6, Internet Penetration Rate: 88.45831679783196
Country: South Korea, Literacy Rate: 66.4, Internet Penetration Rate: 47.165660945429124
Country: Argentina, Literacy Rate: 67.8, Internet Penetration Rate: 29.415710388990107
Country: Canada, Literacy Rate: 70.8, Internet Penetration Rate: 62.165250917581005
Country: Turkey, Literacy Rate: 71.4, Internet Penetration Rate: 52.135862785790486
Country: Australia, Literacy Rate: 71.6, Internet Penetration Rate: 45.89703879412028
Country: United States, Literacy Rate: 72.7, Internet Penetration Rate: 58.311493418395656
Country: Indonesia, Literacy Rate: 75.4, Internet Penetration Rate: 4.9081622090644945
Country: Mexico, Literacy Rate: 76.5, Internet Penetration Rate: 20.74684477664874



What are the top 5 wealthiest countries by total GDP, and how does that compare with population size?

In [40]:
cursor.execute(""" SELECT
    Country,
    Population,
    GDP,
    (GDP * Population) AS GDP_total
FROM final_data
ORDER BY GDP_total DESC
LIMIT 5;
""")

wealtheast_countries = cursor.fetchall()

for row in wealtheast_countries:
    print(f"Country: {row[0]}, population: {row[1]}, GDP: {row[2]}")

Country: Japan, population: 1206263687, GDP: 12473835857216
Country: India, population: 1001406378, GDP: 14540718118693
Country: Spain, population: 898664919, GDP: 15275513099842
Country: United States, population: 1278642419, GDP: 10450436373455
Country: Argentina, population: 1432830251, GDP: 8767336567320


Find countries where internet users exceed 70% of the population

In [41]:
cursor.execute(""" SELECT Country,internet_penetration_rate,population
FROM final_data
WHERE internet_penetration_rate > 70
ORDER BY internet_penetration_rate DESC;
""")

internet_70 = cursor.fetchall()

for row in internet_70:
    print(f"Country: {row[0]},internet_penetration_rate : {row[1]}, population:{row[2]}")

Country: Japan,internet_penetration_rate : 92.22291667974251, population:1206263687
Country: Spain,internet_penetration_rate : 88.45831679783196, population:898664919
Country: China,internet_penetration_rate : 86.4323663321759, population:792846414
Country: South Africa,internet_penetration_rate : 80.61097645605535, population:916989541


What is the average GDP per capita for countries with internet penetration above 50

In [42]:
cursor.execute(""" SELECT country, AVG(GDP) AS avg_gdp_per_capita_high_internet
FROM final_data
WHERE internet_penetration_rate > 50;
""")
print(cursor.fetchall())

[('Canada', 8214857830824.223)]


How many countries have a literacy rate above 90%, and what is their average internet penetration?

In [43]:
cursor.execute("""select country, AVG(literacy_rate), AVG(internet_penetration_rate)
from final_data
where literacy_rate >90
""")
print(cursor.fetchall())

[('France', 96.0, 66.53838337489236)]


In [44]:
conn.close()