In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Dataset 1

In [17]:
# Read the marketing CSV from the working directory and preview the first five rows.
data_1 = pd.read_csv(filepath_or_buffer="03_Clustering_Marketing.csv")
data_1.head(n=5)

Unnamed: 0,gradyear,gender,age,NumberOffriends,basketball,football,soccer,softball,volleyball,swimming,...,blonde,mall,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs
0,2007,,,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2007,F,17.41,49,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,2007,F,17.511,41,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,1
3,2006,F,,36,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2008,F,16.657,1,0,0,0,0,0,1,...,0,0,0,3,0,0,0,0,0,0


Missing values & duplicated data

In [18]:
# Number of rows currently in data_1.
data_1.shape[0]

15000

In [19]:
# Count missing entries per column, then list the columns with the most gaps first.
missing_per_column = data_1.isna().sum()
missing_per_column.sort_values(ascending=False)

age                2496
gender             1337
gradyear              0
NumberOffriends       0
basketball            0
football              0
soccer                0
softball              0
volleyball            0
swimming              0
cheerleading          0
baseball              0
tennis                0
sports                0
cute                  0
sex                   0
sexy                  0
hot                   0
kissed                0
dance                 0
band                  0
marching              0
music                 0
rock                  0
god                   0
church                0
jesus                 0
bible                 0
hair                  0
dress                 0
blonde                0
mall                  0
shopping              0
clothes               0
hollister             0
abercrombie           0
die                   0
death                 0
drunk                 0
drugs                 0
dtype: int64

In [20]:
# Discard every row that has at least one missing value, then re-count the
# missing entries per column to confirm none remain.
data_1 = data_1.dropna()
remaining_missing = data_1.isna().sum()
remaining_missing.sort_values(ascending=False)

gradyear           0
gender             0
age                0
NumberOffriends    0
basketball         0
football           0
soccer             0
softball           0
volleyball         0
swimming           0
cheerleading       0
baseball           0
tennis             0
sports             0
cute               0
sex                0
sexy               0
hot                0
kissed             0
dance              0
band               0
marching           0
music              0
rock               0
god                0
church             0
jesus              0
bible              0
hair               0
dress              0
blonde             0
mall               0
shopping           0
clothes            0
hollister          0
abercrombie        0
die                0
death              0
drunk              0
drugs              0
dtype: int64

In [21]:
# Number of rows currently in data_1.
data_1.shape[0]

12042

In [22]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
data_1.duplicated(keep="first").sum()

np.int64(9)

In [23]:
# Keep only the first occurrence of each fully identical row.
is_repeat = data_1.duplicated(keep="first")
data_1 = data_1[~is_repeat]

In [24]:
# Re-count exact duplicate rows; the expected result after de-duplication is 0.
data_1.duplicated(keep="first").sum()

np.int64(0)

Feature Extraction & Engineering

In [26]:
# Preview the first five rows of data_1.
data_1.head(n=5)

Unnamed: 0,gradyear,gender,age,NumberOffriends,basketball,football,soccer,softball,volleyball,swimming,...,blonde,mall,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs
1,2007,F,17.41,49,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,2007,F,17.511,41,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,1
4,2008,F,16.657,1,0,0,0,0,0,1,...,0,0,0,3,0,0,0,0,0,0
5,2008,M,18.034,32,0,5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2006,M,18.53,18,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [27]:
# Names of all columns whose dtype is 'object', in their original column order.
is_object_column = data_1.dtypes == 'object'
categorical_cols = data_1.columns[is_object_column].tolist()

categorical_cols

['gender', 'age']

In [31]:
# Print the frequency table of each object-dtype column, followed by a
# 50-character dashed rule on its own line.
for col in categorical_cols:
    print(data_1[col].value_counts(), '-' * 50, sep='\n')

gender
F    9703
M    2330
Name: count, dtype: int64
--------------------------------------------------
age
16. Jun    24
17.399     20
16.591     20
17.443     19
17.311     19
           ..
106.439     1
19.732      1
20.025      1
106.248     1
14.209      1
Name: count, Length: 1887, dtype: int64
--------------------------------------------------


In [37]:
# Convert gender column to 1 for 'F' and 0 for 'M'.
# Series.map turns every value that is missing from the mapping into NaN, so a
# plain {'F': 1, 'M': 0} mapping would wipe the whole column to NaN if this
# cell were executed a second time. Mapping the already-encoded values 1 and 0
# to themselves makes the cell safe to re-run.
gender_codes = {'F': 1, 'M': 0, 1: 1, 0: 0}
data_1['gender'] = data_1['gender'].map(gender_codes)

# Check the transformation
print(data_1['gender'].value_counts())

gender
1    9703
0    2330
Name: count, dtype: int64


In [39]:
# Normalise 'age' to whole years in one chained expression:
#   1. cast to str so the .str accessor can be used,
#   2. keep only the first number (decimal or plain integer) found in each
#      value -- entries such as '16. Jun' are reduced to their leading '16',
#   3. parse the extracted text as a number; anything unparseable becomes NaN,
#   4. round to the nearest integer and store it as int.
# NOTE: astype(int) raises if step 3 leaves any NaN behind.
data_1['age'] = (
    pd.to_numeric(
        data_1['age'].astype(str).str.extract(r'(\d+\.\d+|\d+)', expand=False),
        errors='coerce',
    )
    .round()
    .astype(int)
)

# Check the transformation
print(data_1['age'].head())

1    17
2    17
4    16
5    18
6    18
Name: age, dtype: int64


Dataset 2

In [3]:
# Read the Spellman CSV from the working directory and preview the first five rows.
# NOTE(review): in the preview the 'time' column holds identifiers such as
# YAL001C rather than times -- presumably the label belongs to the numeric
# column headers; confirm and consider renaming the column.
data_2 = pd.read_csv(filepath_or_buffer="Spellman.csv")
data_2.head(n=5)

Unnamed: 0,time,40,50,60,70,80,90,100,110,120,...,170,180,190,200,210,220,230,240,250,260
0,YAL001C,-0.07,-0.23,-0.1,0.03,-0.04,-0.12,-0.28,-0.44,-0.09,...,0.59,0.34,-0.28,-0.09,-0.44,0.31,0.03,0.57,0.0,0.01
1,YAL014C,0.215,0.09,0.025,-0.04,-0.04,-0.02,-0.51,-0.08,0.0,...,-0.3,-0.38,0.07,-0.04,0.13,-0.06,-0.26,-0.1,0.27,0.235
2,YAL016W,0.15,0.15,0.22,0.29,-0.1,0.15,-0.73,0.19,-0.15,...,0.12,-0.17,0.11,-0.15,0.03,-0.26,-0.34,-0.34,0.25,0.19
3,YAL020C,-0.35,-0.28,-0.215,-0.15,0.16,-0.12,0.26,0.0,0.13,...,0.07,0.61,-0.2,0.49,-0.43,0.8,-0.47,1.01,-0.36,-0.405
4,YAL022C,-0.415,-0.59,-0.58,-0.57,-0.09,-0.34,0.49,0.32,1.15,...,-0.48,-0.4,-0.59,0.54,-0.09,1.03,0.08,0.57,-0.26,-0.31


In [9]:
# Print the column names, non-null counts and dtypes of data_2.
data_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4381 entries, 0 to 4380
Data columns (total 24 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   time    4381 non-null   object 
 1   40      4381 non-null   float64
 2   50      4381 non-null   float64
 3   60      4381 non-null   float64
 4   70      4381 non-null   float64
 5   80      4381 non-null   float64
 6   90      4381 non-null   float64
 7   100     4381 non-null   float64
 8   110     4381 non-null   float64
 9   120     4381 non-null   float64
 10  130     4381 non-null   float64
 11  140     4381 non-null   float64
 12  150     4381 non-null   float64
 13  160     4381 non-null   float64
 14  170     4381 non-null   float64
 15  180     4381 non-null   float64
 16  190     4381 non-null   float64
 17  200     4381 non-null   float64
 18  210     4381 non-null   float64
 19  220     4381 non-null   float64
 20  230     4381 non-null   float64
 21  240     4381 non-null   float64
 22  

In [10]:
# Count missing entries per column of data_2.
data_2.isna().sum()

time    0
40      0
50      0
60      0
70      0
80      0
90      0
100     0
110     0
120     0
130     0
140     0
150     0
160     0
170     0
180     0
190     0
200     0
210     0
220     0
230     0
240     0
250     0
260     0
dtype: int64

Dataset 3

In [4]:
# Load the CSV for Dataset 3 and preview its first rows.
# NOTE(review): this reads "Spellman.csv", the same file that is loaded as
# data_2 in the Dataset 2 section -- presumably a copy-paste slip; confirm
# which file Dataset 3 is meant to use.
data_3 = pd.read_csv("Spellman.csv")
data_3.head()

Unnamed: 0,time,40,50,60,70,80,90,100,110,120,...,170,180,190,200,210,220,230,240,250,260
0,YAL001C,-0.07,-0.23,-0.1,0.03,-0.04,-0.12,-0.28,-0.44,-0.09,...,0.59,0.34,-0.28,-0.09,-0.44,0.31,0.03,0.57,0.0,0.01
1,YAL014C,0.215,0.09,0.025,-0.04,-0.04,-0.02,-0.51,-0.08,0.0,...,-0.3,-0.38,0.07,-0.04,0.13,-0.06,-0.26,-0.1,0.27,0.235
2,YAL016W,0.15,0.15,0.22,0.29,-0.1,0.15,-0.73,0.19,-0.15,...,0.12,-0.17,0.11,-0.15,0.03,-0.26,-0.34,-0.34,0.25,0.19
3,YAL020C,-0.35,-0.28,-0.215,-0.15,0.16,-0.12,0.26,0.0,0.13,...,0.07,0.61,-0.2,0.49,-0.43,0.8,-0.47,1.01,-0.36,-0.405
4,YAL022C,-0.415,-0.59,-0.58,-0.57,-0.09,-0.34,0.49,0.32,1.15,...,-0.48,-0.4,-0.59,0.54,-0.09,1.03,0.08,0.57,-0.26,-0.31


In [11]:
# Print the column names, non-null counts and dtypes of data_3.
data_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4381 entries, 0 to 4380
Data columns (total 24 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   time    4381 non-null   object 
 1   40      4381 non-null   float64
 2   50      4381 non-null   float64
 3   60      4381 non-null   float64
 4   70      4381 non-null   float64
 5   80      4381 non-null   float64
 6   90      4381 non-null   float64
 7   100     4381 non-null   float64
 8   110     4381 non-null   float64
 9   120     4381 non-null   float64
 10  130     4381 non-null   float64
 11  140     4381 non-null   float64
 12  150     4381 non-null   float64
 13  160     4381 non-null   float64
 14  170     4381 non-null   float64
 15  180     4381 non-null   float64
 16  190     4381 non-null   float64
 17  200     4381 non-null   float64
 18  210     4381 non-null   float64
 19  220     4381 non-null   float64
 20  230     4381 non-null   float64
 21  240     4381 non-null   float64
 22  

In [12]:
# Count missing entries per column of data_3.
data_3.isna().sum()

time    0
40      0
50      0
60      0
70      0
80      0
90      0
100     0
110     0
120     0
130     0
140     0
150     0
160     0
170     0
180     0
190     0
200     0
210     0
220     0
230     0
240     0
250     0
260     0
dtype: int64

Dataset 4

In [5]:
# Read the wine CSV from the working directory and preview the first five rows.
data_4 = pd.read_csv(filepath_or_buffer="wine-clustering.csv")
data_4.head(n=5)

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [13]:
# Print the column names, non-null counts and dtypes of data_4.
data_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Alcohol               178 non-null    float64
 1   Malic_Acid            178 non-null    float64
 2   Ash                   178 non-null    float64
 3   Ash_Alcanity          178 non-null    float64
 4   Magnesium             178 non-null    int64  
 5   Total_Phenols         178 non-null    float64
 6   Flavanoids            178 non-null    float64
 7   Nonflavanoid_Phenols  178 non-null    float64
 8   Proanthocyanins       178 non-null    float64
 9   Color_Intensity       178 non-null    float64
 10  Hue                   178 non-null    float64
 11  OD280                 178 non-null    float64
 12  Proline               178 non-null    int64  
dtypes: float64(11), int64(2)
memory usage: 18.2 KB


In [14]:
# Count missing entries per column of data_4.
data_4.isna().sum()

Alcohol                 0
Malic_Acid              0
Ash                     0
Ash_Alcanity            0
Magnesium               0
Total_Phenols           0
Flavanoids              0
Nonflavanoid_Phenols    0
Proanthocyanins         0
Color_Intensity         0
Hue                     0
OD280                   0
Proline                 0
dtype: int64