# Data Inconsistency

In [26]:
#data inconsistency in machine learning is a common problem in data science.
#import libraries 
# Importing the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
# here willl create a dataframe with random values
data={
    'date':['2022-01-01','22-01-2002','2022-21-03','2022-01-04'],
    'country':['pakistan','pak','pk','P.a.k'],
    'name':['ali','Asif','Usman','Tayyab'],
    '2020_sales':[100,200,None,300],
    '2021_sales':[10987,20,420,None],
}

In [28]:
df=pd.DataFrame(data)
df

Unnamed: 0,date,country,name,2020_sales,2021_sales
0,2022-01-01,pakistan,ali,100.0,10987.0
1,22-01-2002,pak,Asif,200.0,20.0
2,2022-21-03,pk,Usman,,420.0
3,2022-01-04,P.a.k,Tayyab,300.0,


In [29]:
#Standardizing the date format in the dataframe
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df



Unnamed: 0,date,country,name,2020_sales,2021_sales
0,2022-01-01,pakistan,ali,100.0,10987.0
1,NaT,pak,Asif,200.0,20.0
2,NaT,pk,Usman,,420.0
3,2022-01-04,P.a.k,Tayyab,300.0,


In [30]:
#now harmonize the country names 
country_mapping = { 'pakistan': 'pakistan', 'pak': 'pakistan', 'pk': 'pakistan', 'P.a.k': 'pakistan' }
df['country'] = df['country'].map(country_mapping)

In [31]:
df


Unnamed: 0,date,country,name,2020_sales,2021_sales
0,2022-01-01,pakistan,ali,100.0,10987.0
1,NaT,pakistan,Asif,200.0,20.0
2,NaT,pakistan,Usman,,420.0
3,2022-01-04,pakistan,Tayyab,300.0,


In [32]:
#coreect the typographical mistakes in names 
df['name'] = df['name'].str.title() # capitalize the first letter of each word in the 'name' column
df


Unnamed: 0,date,country,name,2020_sales,2021_sales
0,2022-01-01,pakistan,Ali,100.0,10987.0
1,NaT,pakistan,Asif,200.0,20.0
2,NaT,pakistan,Usman,,420.0
3,2022-01-04,pakistan,Tayyab,300.0,


In [33]:
#drops duplicates from the dataframe
df.drop_duplicates()

Unnamed: 0,date,country,name,2020_sales,2021_sales
0,2022-01-01,pakistan,Ali,100.0,10987.0
1,NaT,pakistan,Asif,200.0,20.0
2,NaT,pakistan,Usman,,420.0
3,2022-01-04,pakistan,Tayyab,300.0,


In [35]:
#here we will drop the record of 2021 sales wich have less sales  record of 2020 
df = df[df['2021_sales'] >= df['2020_sales']]

In [36]:
df

Unnamed: 0,date,country,name,2020_sales,2021_sales
0,2022-01-01,pakistan,Ali,100.0,10987.0


# OutLiers

In [38]:
# Step 1: Import the required libraries
import pandas as pd
import numpy as np

# Step 2: Create the data
data = pd.DataFrame({'Age': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 50]})

data

Unnamed: 0,Age
0,20
1,21
2,22
3,23
4,24
5,25
6,26
7,27
8,28
9,29


In [41]:
# Step 3: Calculate the mean and standard deviation
mean = np.mean(data['Age'])
std = np.std(data['Age'])

# Step 4: Calculate the Z-Score
data['Z-Score'] = (data['Age'] - mean) / std

data



Unnamed: 0,Age,Z-Score
0,20,-0.938954
1,21,-0.806396
2,22,-0.673838
3,23,-0.54128
4,24,-0.408721
5,25,-0.276163
6,26,-0.143605
7,27,-0.011047
8,28,0.121512
9,29,0.25407


In [42]:
# Step 5: Print the data
print("----------------------------------------")
print(f"Here is the data with outliers:\n {data}")
print("----------------------------------------")


----------------------------------------
Here is the data with outliers:
     Age   Z-Score
0    20 -0.938954
1    21 -0.806396
2    22 -0.673838
3    23 -0.541280
4    24 -0.408721
5    25 -0.276163
6    26 -0.143605
7    27 -0.011047
8    28  0.121512
9    29  0.254070
10   30  0.386628
11   50  3.037793
----------------------------------------


In [43]:
# Step 6: Print the outliers
print(f"Here are the outliers based on the z-score threshold, 3:\n {data[data['Z-Score'] > 3]}")
print("----------------------------------------")


Here are the outliers based on the z-score threshold, 3:
     Age   Z-Score
11   50  3.037793
----------------------------------------


In [44]:
# Step 7: Remove the outliers
data = data[data['Z-Score'] <= 3]

# Step 8: Print the data without outliers
print(f"Here is the data without outliers:\n {data}")

Here is the data without outliers:
     Age   Z-Score
0    20 -0.938954
1    21 -0.806396
2    22 -0.673838
3    23 -0.541280
4    24 -0.408721
5    25 -0.276163
6    26 -0.143605
7    27 -0.011047
8    28  0.121512
9    29  0.254070
10   30  0.386628


# IQR method

In [45]:
# Step 1: Import the required libraries
import pandas as pd
import numpy as np

# Step 2: Create the data
data = pd.DataFrame({'Age': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 50]})

# Step 3: Calculate the first and third quartile
Q1 = np.percentile(data['Age'], 25, interpolation = 'midpoint')
Q3 = np.percentile(data['Age'], 75, interpolation = 'midpoint')

# Step 4: Calculate the IQR
IQR = Q3 - Q1

# Step 5: Calculate the lower and upper bound
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

# Step 6: Print the data
print("----------------------------------------")
print(f"Here is the data with outliers:\n {data}")
print("----------------------------------------")
# Step 7: Print the outliers
print(f"Here are the outliers based on the IQR threshold:\n {data[(data['Age'] < lower_bound) | (data['Age'] > upper_bound)]}")
print("----------------------------------------")
# Step 8: Remove the outliers
data = data[(data['Age'] >= lower_bound) & (data['Age'] <= upper_bound)]

# Step 9: Print the data without outliers
print(f"Here is the data without outliers:\n {data}")

----------------------------------------
Here is the data with outliers:
     Age
0    20
1    21
2    22
3    23
4    24
5    25
6    26
7    27
8    28
9    29
10   30
11   50
----------------------------------------
Here are the outliers based on the IQR threshold:
     Age
11   50
----------------------------------------
Here is the data without outliers:
     Age
0    20
1    21
2    22
3    23
4    24
5    25
6    26
7    27
8    28
9    29
10   30


# K Means 

In [46]:
# Import library
from sklearn.cluster import KMeans

# Sample data
data = [[2, 2], [3, 3], [3, 4], [30, 30], [31, 31], [32, 32]]

# Create a K-means model with two clusters (normal and outlier)
kmeans = KMeans(n_clusters=2, n_init=10)
kmeans.fit(data)

# Predict cluster labels
labels = kmeans.predict(data)

# Identify outliers based on cluster labels
outliers = [data[i] for i, label in enumerate(labels) if label == 1]

# print data
print("Data:", data)
print("Outliers:", outliers)
# Remove outliers
data = [data[i] for i, label in enumerate(labels) if label == 0]
print("Data without outliers:", data)

ModuleNotFoundError: No module named 'sklearn'