In [208]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

In [209]:
# Notebook-wide display settings: show up to 50 columns when rendering frames
# and draw seaborn plots on a whitegrid theme with a light green axes background.
pd.set_option("display.max_columns", 50)
sn.set(style="whitegrid", rc={"axes.facecolor": "#CAF1DE"})

# Creating dataframe

In [230]:
# Load the raw records from Work.csv (path is relative to the working directory).
df = pd.read_csv('Work.csv')
df  # last expression in the cell: renders the frame as the cell output

Unnamed: 0,Name,city,age,py-score
0,Emma,Kampala,23,90
1,Wilber,Mbale,26,75
2,Robin,Gulu,25,mine
3,Tevor,Livingstone,,89
4,Yeko,Tororo,20,94
5,Miriam,Arua,Train,
6,Jesca,Mbarara,21,84


# Check the shape of the data

In [211]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(7, 4)

# Find the column names

In [212]:
# Column labels of the frame, returned as a pandas Index.
df.columns

Index(['Name', 'city', 'age', 'py-score'], dtype='object')

# Checking for basic information

In [213]:
# Prints the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Name      7 non-null      object
 1   city      7 non-null      object
 2   age       6 non-null      object
 3   py-score  6 non-null      object
dtypes: object(4)
memory usage: 352.0+ bytes


# Check null values

In [215]:
# Count missing values per column, then list the columns from fewest to most missing.
null_counts = df.isnull().sum()
null_counts.sort_values(ascending=True)

Name        0
city        0
age         1
py-score    1
dtype: int64

# Check for data Types

In [216]:
# dtype of each column. 'age' and 'py-score' come back as object rather than a
# numeric dtype because they contain stray text entries (e.g. 'Train', 'mine').
df.dtypes


Name        object
city        object
age         object
py-score    object
dtype: object

# Check null values in percentage

In [217]:
# Report, for every column that has at least one missing value,
# the share of rows (in percent) where that column is missing.
row_count = df.shape[0]
for column in df.columns:
    missing = df[column].isnull().sum()
    if missing > 0:
        print(column, "----->", missing * 100 / row_count, "%")

age -----> 14.285714285714286 %
py-score -----> 14.285714285714286 %


# Fill Null Values 

In [218]:
# Preview only: fillna returns a new frame with missing 'age' set to 20 and
# missing 'py-score' set to 0. The result is displayed but not assigned, so df
# itself still contains its missing values after this cell.
df.fillna({'age': 20,'py-score':0})

Unnamed: 0,Name,city,age,py-score
0,Emma,Kampala,23,90
1,Wilber,Mbale,26,75
2,Robin,Gulu,25,mine
3,Tevor,Livingstone,20,89
4,Yeko,Tororo,20,94
5,Miriam,Arua,Train,0
6,Jesca,Mbarara,21,84


# Drop the null values

In [219]:
# Preview only: dropna() returns a new frame without the rows that contain a
# missing value; the result is not assigned, so df is left unchanged.
df.dropna()
# To drop the columns that contain missing values instead of the rows:
# df.dropna(axis=1)


Unnamed: 0,Name,city,age,py-score
0,Emma,Kampala,23,90
1,Wilber,Mbale,26,75
2,Robin,Gulu,25,mine
4,Yeko,Tororo,20,94
6,Jesca,Mbarara,21,84


# Check for duplicates

In [191]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

0

# Check statistical details about my data

In [169]:
# Summary of the object-dtype columns (count, unique, top, freq);
# .T transposes the result so each original column becomes one row.
df.describe(include="object").T

Unnamed: 0,count,unique,top,freq
Name,7,7,Emma,1
city,7,7,Kampala,1
age,6,6,23,1
py-score,6,6,90,1


# Check Unique values

In [170]:
# Print the distinct values found in each column, one column per line.
for column in df.columns:
    distinct_values = df[column].unique()
    print(column, "------", distinct_values)

Name ------ ['Emma' 'Wilber' 'Robin' 'Tevor' 'Yeko' 'Miriam' 'Jesca']
city ------ ['Kampala' 'Mbale' 'Gulu' 'Livingstone' 'Tororo' 'Arua' 'Mbarara']
age ------ ['23' '26' '25' nan '20' 'Train' '21']
py-score ------ ['90' '75' 'mine' '89' '94' nan '84']


# Check the very first rows of my data

In [171]:
# First five rows (head() default).
df.head()

Unnamed: 0,Name,city,age,py-score
0,Emma,Kampala,23.0,90
1,Wilber,Mbale,26.0,75
2,Robin,Gulu,25.0,mine
3,Tevor,Livingstone,,89
4,Yeko,Tororo,20.0,94


# Check the very last rows of my data

In [172]:
# Last five rows (tail() default).
df.tail()

Unnamed: 0,Name,city,age,py-score
2,Robin,Gulu,25,mine
3,Tevor,Livingstone,,89
4,Yeko,Tororo,20,94
5,Miriam,Arua,Train,
6,Jesca,Mbarara,21,84


# Removing duplicates

In [173]:
# Keep only the first occurrence of each fully identical row and rebind df to the result.
df = df.drop_duplicates()

# Remove wrong data

In [231]:
# Replace missing values in 'age' and 'py-score' with empty strings,
# then show the frame.
for column in ('age', 'py-score'):
    df[column] = df[column].replace(np.nan, '')
df

Unnamed: 0,Name,city,age,py-score
0,Emma,Kampala,23,90
1,Wilber,Mbale,26,75
2,Robin,Gulu,25,mine
3,Tevor,Livingstone,,89
4,Yeko,Tororo,20,94
5,Miriam,Arua,Train,
6,Jesca,Mbarara,21,84


In [233]:
# Remove every non-digit character from the text entries of both columns, so
# purely textual entries such as 'mine' or 'Train' collapse to an empty string.
for column in ('py-score', 'age'):
    df[column] = df[column].replace(r'\D', '', regex=True)
df

Unnamed: 0,Name,city,age,py-score
0,Emma,Kampala,23.0,90.0
1,Wilber,Mbale,26.0,75.0
2,Robin,Gulu,25.0,
3,Tevor,Livingstone,,89.0
4,Yeko,Tororo,20.0,94.0
5,Miriam,Arua,,
6,Jesca,Mbarara,21.0,84.0


In [239]:
# Turn the remaining empty strings (originally missing or non-numeric entries)
# into 0 so both columns can be cast to a numeric dtype later on.
# Note: 0 is only a placeholder here, not a real age or score.
for column in ('age', 'py-score'):
    df[column] = df[column].replace('', 0)
df



Unnamed: 0,Name,city,age,py-score
0,Emma,Kampala,23,90
1,Wilber,Mbale,26,75
2,Robin,Gulu,25,0
3,Tevor,Livingstone,0,89
4,Yeko,Tororo,20,94
5,Miriam,Arua,0,0
6,Jesca,Mbarara,21,84


# Still removing wrong data: I noticed that the city column has a weird entry, so I run that column against a cleaner dataset of cities

In [240]:
from fuzzywuzzy import fuzz

# Score (0-100, from fuzz.token_sort_ratio) that a reference city must exceed
# before it replaces an entry in df['city']. 0 keeps the original acceptance
# rule, under which a name that is absent from the reference list is still
# replaced by its nearest lookalike; raise the floor to leave such names as-is.
MIN_CITY_SIMILARITY = 0

# Reduce the reference dataset to the columns needed for matching and save that
# trimmed copy. The trimmed frame is reused directly instead of being read back
# from the file that was just written.
columns_to_remove = ['lat', 'lng', 'country', 'iso2', 'admin_name', 'capital', 'population', 'population_proper']
df_clean_cities = pd.read_csv('ug.csv').drop(columns=columns_to_remove)
df_clean_cities.to_csv('modified_ug.csv', index=False)
df_clean_cities1 = df_clean_cities

# Missing reference names cannot be a meaningful replacement, so leave them out.
reference_cities = df_clean_cities1['clean-city'].dropna().tolist()


def _closest_city(name):
    """Return the reference city most similar to ``name``.

    Parameters
    ----------
    name : object
        A value taken from ``df['city']``.

    Returns
    -------
    object
        The first reference city with the highest ``token_sort_ratio`` score,
        or ``name`` unchanged when it is not a string, when there are no
        reference cities, or when no score exceeds ``MIN_CITY_SIMILARITY``.
    """
    if not isinstance(name, str) or not reference_cities:
        return name
    scores = [fuzz.token_sort_ratio(name, candidate) for candidate in reference_cities]
    best_score = max(scores)
    if best_score <= MIN_CITY_SIMILARITY:
        # Nothing is similar enough: keep the original entry rather than
        # overwriting it with None.
        return name
    # index() returns the first position of the maximum, which matches the
    # "first strictly better candidate wins" rule of a sequential scan.
    return reference_cities[scores.index(best_score)]


df['city'] = df['city'].map(_closest_city)

df

Unnamed: 0,Name,city,age,py-score
0,Emma,Kampala,23,90
1,Wilber,Mbale,26,75
2,Robin,Gulu,25,0
3,Tevor,Lyantonde,0,89
4,Yeko,Tororo,20,94
5,Miriam,Arua,0,0
6,Jesca,Mbarara,21,84


In [176]:
# Write the cleaned frame to cleaned_data.csv; index=False leaves the row index out of the file.
df.to_csv('cleaned_data.csv', index=False)

# Performing correlation

In [241]:
# Cast both columns to float, then compute the correlation between them.
for column in ('age', 'py-score'):
    df[column] = df[column].astype(float)
# Despite its name, this holds a single coefficient (Series.corr returns a scalar), not a matrix.
correlation_matrix = df['age'].corr(df['py-score'])
print(correlation_matrix)

0.19439562161942436


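# This correlation value indicates a very weak positive correlation between age and py-score, meaning age alone is not a strong predictor of py-score and other factors likely have a more significant influence. Note that the estimate is based on only 7 rows, in which missing or invalid values were replaced with 0, so it should be interpreted with caution.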