In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

## CODE.py — Exploratory data analysis (shape, skewness/kurtosis, histograms, pairwise scatter plots)

In [None]:
# load the dataset into a pandas DataFrame
df = pd.read_csv('datasets/dataset.csv')

# print the number of rows and columns in the dataset
n_rows, n_columns = df.shape
print('Number of rows:', n_rows)
print('Number of columns:', n_columns)

# calculate the skewness and kurtosis of each numeric column.
# numeric_only=True is passed explicitly: these moments are undefined for
# text/categorical columns, and pandas >= 2.0 raises on such columns instead
# of silently skipping them as older releases did.
skewness = df.skew(numeric_only=True)
kurtosis = df.kurt(numeric_only=True)

# print the skewness and kurtosis of each numeric column
print('Skewness:')
print(skewness)
print('Kurtosis:')
print(kurtosis)

# plot a histogram of each column (one figure per column).
# Missing values carry no information for a histogram, so they are dropped
# before binning.
for column in df.columns:
    plt.hist(df[column].dropna())
    plt.title(column)
    plt.show()

# plot a scatter plot of each unordered pair of columns (i < j), one figure
# per pair; positional access keeps this working for any column labels
for i in range(n_columns - 1):
    for j in range(i + 1, n_columns):
        x_name, y_name = df.columns[i], df.columns[j]
        plt.scatter(df.iloc[:, i], df.iloc[:, j])
        plt.xlabel(x_name)
        plt.ylabel(y_name)
        plt.title(f'{y_name} vs {x_name}')
        plt.show()


## K-mean.py — K-means clustering on synthetic blob data

In [None]:
# Generate random data for clustering (seeded, so the points are reproducible)
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# Visualize the raw data in its own figure; without the explicit show() the
# unlabelled points would be drawn on the same axes as the cluster plot below
# and end up hidden underneath it
plt.scatter(X[:, 0], X[:, 1], s=50)
plt.title('Raw data')
plt.show()

# Create K-means clustering model with 4 clusters.
# random_state pins the centroid initialisation so labels and centroids are
# the same on every run; n_init is set explicitly because its default has
# changed between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)

# Fit the data to the model
kmeans.fit(X)

# Get the cluster labels and centroids
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

# Visualize the clusters and centroids.
# One vectorised scatter call with a per-point colour list replaces drawing
# each of the points with its own scatter call.
colors = ['r', 'g', 'b', 'y']
point_colors = [colors[cluster] for cluster in labels]
plt.scatter(X[:, 0], X[:, 1], c=point_colors, s=50)

plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, c='#050505')
plt.title('K-means clusters and centroids')
plt.show()


## Miss.py — Data cleaning: type coercion, missing-value imputation, duplicate removal

In [None]:
# Load data from a CSV file
data = pd.read_csv('data.csv')

# Fix incorrect data types.
# Entries in 'age' that cannot be parsed as numbers become NaN and are
# imputed with the mean further down.
data['age'] = pd.to_numeric(data['age'], errors='coerce')
# NOTE(review): on an object column astype(bool) applies plain truthiness, so
# any non-empty string (even 'False' or 'no') and NaN all become True.
# Confirm 'is_active' holds real booleans or 0/1 before relying on it.
data['is_active'] = data['is_active'].astype(bool)

# Drop irrelevant variables
data = data.drop(columns=['id'])

# Fill missing values with the column mean (age) or median (income).
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on data[...]: that chained in-place call acts on a
# column selection, is deprecated in recent pandas, and leaves `data`
# unchanged once Copy-on-Write is enabled.
mean_age = data['age'].mean()
data['age'] = data['age'].fillna(mean_age)

median_income = data['income'].median()
data['income'] = data['income'].fillna(median_income)

# Drop fully duplicated rows (after imputation, as before)
data = data.drop_duplicates()

# Print the cleaned data
print(data.head())


## Time.py — Time series analysis: monthly resampling, rolling statistics, seasonal decomposition

In [None]:
# Dependency of the decomposition step below; imported first so a missing
# statsmodels install fails before any data is loaded or plotted
from statsmodels.tsa.seasonal import seasonal_decompose

# Load time series data from a CSV file, using the parsed 'date' column as index
data = pd.read_csv('data.csv', parse_dates=['date'], index_col='date')

# Print the first few rows of the data
print(data.head())

# Resample the data to monthly (month-end) frequency, averaging within months.
# An explicit MonthEnd offset is used instead of the 'M' string alias: the
# alias is deprecated in favour of 'ME' in recent pandas, while 'ME' is not
# recognised by older releases; the offset object works with both.
monthly_data = data.resample(pd.offsets.MonthEnd()).mean()

# Visualize the time series data
plt.plot(monthly_data)
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Time Series Data')
plt.show()

# Calculate the rolling mean and standard deviation over a 12-row window
# (12 monthly rows = one year); the first 11 rows of each result are NaN
rolling_mean = monthly_data.rolling(window=12).mean()
rolling_std = monthly_data.rolling(window=12).std()

# Visualize the rolling statistics
plt.plot(monthly_data, color='blue', label='Original')
plt.plot(rolling_mean, color='red', label='Rolling Mean')
plt.plot(rolling_std, color='black', label='Rolling Std')
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Rolling Statistics')
plt.legend()
plt.show()

# Perform time series decomposition.
# NOTE(review): months without observations come out of the resample as NaN,
# and seasonal_decompose is expected to reject missing values — fill or
# interpolate monthly_data first if the series has gaps.
decomposition = seasonal_decompose(monthly_data)

trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid

# Visualize the decomposed components, stacked as four rows in one figure
components = [
    (monthly_data, 'Original'),
    (trend, 'Trend'),
    (seasonal, 'Seasonality'),
    (residual, 'Residuals'),
]
for position, (series, label) in enumerate(components, start=1):
    plt.subplot(len(components), 1, position)
    plt.plot(series, label=label)
    plt.legend(loc='best')
plt.tight_layout()
plt.show()
