## **Creating Dummy DataSet**

In [1]:
import numpy as np

labels = ['High School', 'Bachelor', 'Master', 'PhD']
size = 100

# Generate stratified choice with equal label distribution
stratified_data = np.random.choice(labels, size=size, p=[0.25, 0.25, 0.25, 0.25])

# Check the distribution of labels
label_counts = np.unique(stratified_data, return_counts=True)

print(label_counts)

for label, count in zip(label_counts[0], label_counts[1]):
    print(f"{label}: {count} occurrences")

(array(['Bachelor', 'High School', 'Master', 'PhD'], dtype='<U11'), array([33, 25, 21, 21]))
Bachelor: 33 occurrences
High School: 25 occurrences
Master: 21 occurrences
PhD: 21 occurrences


**1. Simple DataSet**

In [2]:
import pandas as pd
import numpy as np

# setting the random seed
np.random.seed(42)

# Defining the dataset
ages = np.random.randint(18, 100, size=100)
income = np.random.randint(1000, 10000, size=100)
genders = np.random.choice(['F', 'M'], size=100, p=[0.5, 0.5])
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], size=100, p=[0.25, 0.25, 0.25, 0.25])
marital_status = np.random.choice(['Married', 'Single', 'Divorced', 'Widowed'], size=100, p=[0.25, 0.25, 0.25, 0.25])

labels = []
for age, gender in zip(ages, genders):
    if age < 50 and gender == 'F':
        labels.append(1)
    elif age < 50 and gender == 'M':
        labels.append(0)
    elif age >= 50 and gender == 'F':
        labels.append(0)
    else:
        labels.append(1)
labels = np.array(labels)


# to the pandas dataframe
sample_data_1 = pd.DataFrame({
    'age': ages,
    'income': income,
    'gender': genders,
    'education': education,
    'marital_status': marital_status,
    'label': labels
})

display(sample_data_1)

Unnamed: 0,age,income,gender,education,marital_status,label
0,69,4561,M,Master,Single,1
1,32,7184,F,Master,Widowed,1
2,89,4099,F,Master,Widowed,0
3,78,7278,F,Bachelor,Married,0
4,38,9392,M,Bachelor,Married,0
...,...,...,...,...,...,...
95,25,8777,M,Bachelor,Married,0
96,52,1197,M,Master,Divorced,1
97,52,8125,F,Bachelor,Divorced,0
98,50,2930,M,Master,Divorced,1


**2. Dataset with even more columns**

> **which we can use to practice using various Scikit-learn preprocessing tools**

In [3]:
import pandas as pd
import numpy as np

# Define the dataset
np.random.seed(123)

ages = np.random.randint(18, 100, size=100)
genders = np.random.choice(['F', 'M'], size=100, p=[0.50, 0.50])
income = np.random.randint(1000, 10000, size=100)
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], size=100, p=[0.25, 0.25, 0.25, 0.25])
marital_status = np.random.choice(['Married', 'Single', 'Divorced', 'Widowed'], size=100, p=[0.25, 0.25, 0.25, 0.25])
num_friends = np.random.randint(0, 100, size=100)
num_purchases = np.random.randint(0, 1000, size=100)
credit_score = np.random.randint(300, 850, size=100)
num_children = np.random.randint(0, 10, size=100)
city = np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], size=100, p=[0.20, 0.20, 0.20, 0.20, 0.20])

labels = []
for age, gender in zip(ages, genders):
    if age < 50 and gender == 'F':
        labels.append(1)
    elif age < 50 and gender == 'M':
        labels.append(0)
    elif age >= 50 and gender == 'F':
        labels.append(0)
    else:
        labels.append(1)
labels = np.array(labels)

sample_data_2 = pd.DataFrame({
    'age': ages,
    'gender': genders,
    'income': income,
    'education': education,
    'marital_status': marital_status,
    'num_friends': num_friends,
    'num_purchases': num_purchases,
    'credit_score': credit_score,
    'num_children': num_children,
    'city': city,
    'label': labels
})

display(sample_data_2)

Unnamed: 0,age,gender,income,education,marital_status,num_friends,num_purchases,credit_score,num_children,city,label
0,84,M,3191,High School,Married,14,760,697,4,Phoenix,1
1,35,M,9113,PhD,Single,37,0,719,1,Los Angeles,0
2,75,F,3512,High School,Widowed,96,590,453,4,Phoenix,0
3,65,F,1639,High School,Divorced,48,127,531,2,New York,0
4,91,F,4054,PhD,Widowed,71,144,645,1,Phoenix,0
...,...,...,...,...,...,...,...,...,...,...,...
95,18,F,9281,PhD,Widowed,49,23,779,3,Los Angeles,1
96,53,M,3625,High School,Married,79,987,767,3,Houston,1
97,47,F,6397,PhD,Divorced,67,4,711,9,Phoenix,1
98,19,F,5028,Bachelor,Married,24,925,599,1,Houston,1


**3. Dataset with some missing values and duplicates**

we can use to practice data **cleaning** and **preprocessing**

In [4]:
import pandas as pd
import numpy as np

# Define the dataset
np.random.seed(42)

ages = np.random.randint(18, 100, size=100)
genders = np.random.choice(['F', 'M'], size=100, p=[0.50, 0.50])
income = np.random.randint(1000, 10000, size=100)
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD', np.nan], size=100)
marital_status = np.random.choice(['Married', 'Single', 'Divorced', 'Widowed'], size=100)
num_friends = np.random.randint(0, 100, size=100)
num_purchases = np.random.randint(0, 1000, size=100)
credit_score = np.random.randint(300, 850, size=100)
num_children = np.random.randint(0, 10, size=100)
city = np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], size=100)

labels = []
for age, gender in zip(ages, genders):
    if age < 50 and gender == 'F':
        labels.append(1)
    elif age < 50 and gender == 'M':
        labels.append(0)
    elif age >= 50 and gender == 'F':
        labels.append(0)
    else:
        labels.append(1)
labels = np.array(labels)

# Create the dataframe
sample_data_3 = pd.DataFrame({
    'age': ages,
    'gender': genders,
    'income': income,
    'education': education,
    'marital_status': marital_status,
    'num_friends': num_friends,
    'num_purchases': num_purchases,
    'credit_score': credit_score,
    'num_children': num_children,
    'city': city,
    'label': labels
})

# Introduce missing values
for i in range(20):
    data_index = np.random.randint(0, 100)
    column_index = np.random.randint(0, 11)
    sample_data_3.iloc[data_index, column_index] = np.nan

# Introduce duplicates
sample_data_3 = pd.concat([sample_data_3, sample_data_3.iloc[0:5]])

display(sample_data_3)

Unnamed: 0,age,gender,income,education,marital_status,num_friends,num_purchases,credit_score,num_children,city,label
0,69.0,M,7694.0,Bachelor,Single,54.0,894.0,749.0,5,New York,1.0
1,32.0,M,3385.0,Bachelor,Widowed,12.0,360.0,309.0,0,Chicago,0.0
2,89.0,M,5736.0,Bachelor,Divorced,22.0,934.0,541.0,4,New York,1.0
3,78.0,M,2802.0,High School,Single,88.0,0.0,550.0,8,Los Angeles,1.0
4,38.0,F,9155.0,,Single,98.0,386.0,304.0,9,Chicago,1.0
...,...,...,...,...,...,...,...,...,...,...,...
0,69.0,M,7694.0,Bachelor,Single,54.0,894.0,749.0,5,New York,1.0
1,32.0,M,3385.0,Bachelor,Widowed,12.0,360.0,309.0,0,Chicago,0.0
2,89.0,M,5736.0,Bachelor,Divorced,22.0,934.0,541.0,4,New York,1.0
3,78.0,M,2802.0,High School,Single,88.0,0.0,550.0,8,Los Angeles,1.0


**4. Modifying the data[`adding more features`] `+` missing `+` adding outliers**

In [None]:
!pip install faker -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.7 MB[0m [31m8.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m29.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from sklearn.preprocessing import Binarizer
from sklearn.impute import SimpleImputer

np.random.seed(42)

# creating the fakers object
faker = Faker()

# Define the dataset
ages = np.random.randint(18, 100, size=100)
genders = np.random.choice(['F', 'M'], size=100, p=[0.50, 0.50])
income = np.random.randint(1000, 10000, size=100)
education = np.random.choice(['High School', 'Bachelor', 'Master', 'PhD', np.nan], size=100)
marital_status = np.random.choice(['Married', 'Single', 'Divorced', 'Widowed'], size=100)
num_friends = np.random.randint(0, 100, size=100)
num_purchases = np.random.randint(0, 1000, size=100)
credit_score = np.random.randint(300, 850, size=100)
num_children = np.random.randint(0, 10, size=100)
city = np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], size=100, p=[0.20, 0.20, 0.20, 0.20, 0.20])

labels = []
for age, gender in zip(ages, genders):
    if age < 50 and gender == 'F':
        labels.append(1)
    elif age < 50 and gender == 'M':
        labels.append(0)
    elif age >= 50 and gender == 'F':
        labels.append(0)
    else:
        labels.append(1)
labels = np.array(labels)

# Create the dataframe
sample_data_4 = pd.DataFrame({
    'age': ages,
    'gender': genders,
    'income': income,
    'education': education,
    'marital_status': marital_status,
    'num_friends': num_friends,
    'num_purchases': num_purchases,
    'credit_score': credit_score,
    'num_children': num_children,
    'city': city,
    'label': labels
})

# Add new features
sample_data_4['occupation'] = np.random.choice(['Teacher', 'Engineer', 'Doctor', 'Lawyer', 'Accountant',
                                                'Artist', 'Writer', 'Salesperson', 'Manager', 'Student'], size=100)
sample_data_4['height'] = np.random.randint(150, 200, size=100)
sample_data_4['bio'] = [faker.sentence() for _ in range(100)]
sample_data_4['birthdate'] = pd.to_datetime(np.random.choice(
    pd.date_range('1950-01-01', '2005-12-31'), size=100))

# Introduce missing values
for i in range(20):
    data_index = np.random.randint(0, 100)
    column_index = np.random.randint(0, 11)
    sample_data_4.iloc[data_index, column_index] = np.nan

# Introduce outliers
for i in range(10):
    data_index = np.random.randint(0, 100)
    column_index = np.random.choice([2, 6, 8])
    sample_data_4.iloc[data_index, column_index] = np.random.choice(
        [500000, -500000, -1])

# Create new feature from existing one
binarizer = Binarizer(threshold=50)

# Impute missing values with mean
imputer = SimpleImputer(strategy='mean')
sample_data_4['age'] = imputer.fit_transform(sample_data_4[['age']])

# Apply binarizer
sample_data_4['age_group'] = binarizer.fit_transform(
    sample_data_4[['age']])

display(sample_data_4)

In [None]:
sample_data_4.isna().sum()

## **Creating the Dataset using `sklearn.dataset`**

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression

# Generate regression dataset
X, y = make_regression(n_samples=100, n_features=1, random_state=42)

# Plot the data
plt.scatter(X, y)
plt.xlabel('X')
plt.ylabel('y')
plt.title('Regression Data')
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

# Generate classification dataset
X, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, n_classes=2, random_state=42)

# Plot the data
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='bwr')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Classification Data')
plt.show()