# A Collection of Useful Python Snippets

## Install Libraries

In [None]:
# Run once in every environment
!pip install wordcloud
!pip install seaborn
!pip install plotly

## Loading Data

### From Local .csv

In [None]:
import pandas as pd

# Load the CSV file (path is relative to the current working directory)
# into the DataFrame used by the rest of the snippets.
data = pd.read_csv(filepath_or_buffer="data.csv")

### From Google Drive

In [None]:
import requests
import pandas as pd
from io import BytesIO

# Where to find the file ID inside a sharing link:
# https://drive.google.com/file/d/13x2amKB3smbbh0P3yk2djhqBF-D5R4Dz/view?usp=sharing
#                            id = 13x2amKB3smbbh0P3yk2djhqBF-D5R4Dz

file_id = ""  # input file ID
file_url = f"https://drive.google.com/uc?id={file_id}"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(file_url, timeout=30)

# NOTE(review): Drive appears to answer some failed requests (e.g. files that
# are not shared publicly) with an HTML page and status 200 -- confirm. Such a
# body must not be handed to the CSV parser, so it is treated as a failure.
content_type = response.headers.get("Content-Type", "")
if response.status_code == 200 and "text/html" not in content_type:
    content = BytesIO(response.content)
    data = pd.read_csv(content)
    print(data.head())
else:
    print("Failed to download the file.")

## Exploratory Data Analysis & Data Visualization

### Simple EDA

In [None]:
# Quick-look commands for a DataFrame. A notebook cell only renders the value
# of its last expression automatically, so run these one at a time (or wrap
# them in print()/display()) to see each result.
data.info()  # prints column names, non-null counts and dtypes
data.head()  # first rows
data.tail()  # last rows
data.describe().T  # summary statistics, one row per column
data.isnull().sum()  # number of missing values per column
data.isna().sum()  # same result: isna is an alias of isnull
data.describe().T.plot(kind='bar')  # bar chart of the summary statistics
data.describe(include='all').T  # summary statistics including non-numeric columns
data.duplicated().sum()  # number of rows that repeat an earlier row
data.columns.to_list()  # column names as a plain list


### View Output Data Imbalance

In [None]:
# Count how often each value of column 'Y' occurs, then show the counts as
# text and as a horizontal bar chart.
target_counts = data['Y'].value_counts()
print(target_counts)
target_counts.plot(kind='barh')

### Correlation Matrix (bundle)

In [None]:
# Correlation of every numeric column with column 'Y'.
# Non-numeric columns are filtered out first: recent pandas versions no longer
# drop them silently and may raise when a column cannot be converted to float.
data.select_dtypes(include='number').corrwith(data["Y"])

In [None]:
# Pairwise correlation of the numeric columns only; since pandas 2.0
# DataFrame.corr() no longer skips non-numeric columns by default.
data.select_dtypes(include='number').corr()

In [None]:
# Pearson correlation matrix, drawn as a heatmap of the lower triangle
# (diagonal included).
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Only numeric columns: since pandas 2.0 corr() no longer skips the others.
corr = data.select_dtypes(include=np.number).corr(method='pearson')

# Boolean mask that is True strictly above the diagonal; seaborn leaves the
# cells where the mask is True blank. Building the mask from the shape (not
# from the correlation values) keeps a correlation of exactly 0 hidden too.
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

# Single figure size (the former set_size_inches call overrode figsize).
fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corr, mask=mask, vmax=0.9, square=True, annot=True, ax=ax)

In [None]:
# Annotated correlation heatmap of the numeric columns.
numeric_cols = data.select_dtypes(include=np.number).columns
numeric_corr = data[numeric_cols].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(numeric_corr, cmap='coolwarm', annot=True)
plt.title('Correlation Matrix')
plt.show()

### Sort Correlated Features

In [None]:
# Rank the features by the absolute value of their correlation with the
# target column 'Y', strongest first (uses the `corr` matrix computed above).
correlated_features = corr['Y'].sort_values(key=lambda s: s.abs(), ascending=False)
print("Features sorted by correlation with the target:\n", correlated_features)

### Names of All Non-Numeric Columns with Their Unique Values

In [None]:
# For every non-numeric column, print its name followed by its distinct values.
non_numeric_columns = data.select_dtypes(exclude='number').columns
for col in non_numeric_columns:
    distinct_values = data[col].unique()
    print(f"Unique values in column {col}:", distinct_values, sep="\n")

### Plotting of All Columns (Numeric)

In [None]:
import seaborn as sns

# Pairwise plots of the columns with a regression fit per panel; the fitted
# line is drawn in red.
regression_line_style = {'line_kws': {'color': 'red'}}
sns.pairplot(data, kind='reg', plot_kws=regression_line_style)

### Plotting of Categorical Columns

In [None]:
# Bar chart of the value counts of each listed categorical column, one panel
# per column on an n_row x n_col grid.
import matplotlib.pyplot as plt

categories = ['X', 'Y']
n_row = 2 # number of rows
n_col = 2 # number of columns

# squeeze=False keeps `axes` two-dimensional even for a single row or column.
fig, axes = plt.subplots(nrows=n_row, ncols=n_col, figsize=(12, 8), squeeze=False)
panels = axes.ravel()  # row-major order: left to right, then top to bottom

if len(categories) > panels.size:
    raise ValueError("n_row * n_col must be at least len(categories)")

for ax, category in zip(panels, categories):
    counts = data[category].value_counts()

    # String positions make the x axis categorical, so every bar gets its own
    # tick and label even when the category values are numbers.
    ax.bar(counts.index.astype(str), counts.to_numpy())
    ax.set_title(f'{category} distribution')
    ax.set_xlabel(category)
    ax.set_ylabel('count')

    # Rotate the labels matplotlib placed itself rather than overwriting them,
    # which could attach labels to the wrong tick positions.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Hide grid panels that did not receive a plot.
for ax in panels[len(categories):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
from plotly import express

# Show one interactive histogram per listed column.
for column in ['X', 'Y']:
    histogram = express.histogram(data, x=column)
    histogram.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Join all non-null entries of column 'Y' into one text. The cast to str keeps
# the join from raising TypeError when the column holds non-string values.
reviews_text = ' '.join(data['Y'].dropna().astype(str))
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(reviews_text)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Reviews')
plt.axis('off')
plt.show()

### Display Rows with Null

In [None]:
# Select every row that has a missing value in at least one column.
has_missing = data.isnull().any(axis=1)
null_rows = data.loc[has_missing]

# Trailing expression so the notebook renders the selected rows.
null_rows


## Preprocessing

### Mapping Non-Numeric to Numeric

In [None]:
# Replace category labels by numbers in the listed columns. Values without an
# entry in `mapping` become NaN.
mapping = {'x': 1, 'y': 2, 'z': 3}
columns_to_map = ['col1', 'col2', 'col3']
for column in columns_to_map:
    mapped = data[column].map(mapping)
    # Variant: use data[column].map(mapping).fillna(0) to store 0 instead of NaN.
    data[column] = mapped

### Dropping Useless Columns

In [None]:
# Remove the listed columns from `data` in place.
columns = ['X', 'Y']
data.drop(columns=columns, inplace=True)

### Dropping Row Based on Condition

In [None]:
# Keep only the rows whose 'Y' value differs from xyz; xyz stands for the
# value to filter out and has to be defined before running this cell.
rows_to_keep = data['Y'] != xyz
data = data[rows_to_keep]

### Renaming Columns

In [None]:
# Overwrite all column labels at once; the list must contain one name per
# existing column, in column order.
new_column_names = ['X', 'Y']
data.columns = new_column_names

### Test-Train Split

In [None]:
from sklearn.model_selection import train_test_split

ycol = "Y"  # name of the target column
r_state = 42  # fixed seed so the split is repeatable
t_size = 0.2  # share of the rows that goes to the test set

# Target is the `ycol` column; features are all remaining columns.
y = data[ycol]
X = data.drop(ycol, axis=1)

split_options = {"test_size": t_size, "random_state": r_state}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Test-Validation Split

In [None]:
from sklearn.model_selection import train_test_split

# Split the existing test set once more: a t_size share of it becomes the
# validation set, the remainder stays the test set.
r_state = 42
t_size = 0.2
X, y = X_test, y_test

X_test, X_val, y_test, y_val = train_test_split(
    X, y, test_size=t_size, random_state=r_state
)

### Train-Test-Validation Split

In [None]:
from sklearn.model_selection import train_test_split

# Three-way split of the full dataset: first hold out a t_size share of the
# rows, then divide that hold-out into a test and a validation part.
ycol = "Y"  # y column
X = data.drop(columns=[ycol])
y = data[ycol]
r_state = 42
t_size = 0.2  # share of all rows held out from training
v_size = 0.5  # share of the hold-out that becomes the validation set

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=t_size, random_state=r_state
)
# The hold-out is divided with v_size (not t_size), so with the defaults the
# result is 80% train, 10% test and 10% validation.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=v_size, random_state=r_state
)

### Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that fitted
# transformation to the training, test and validation features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

## Modelling & Evaluation

### Simple model with accuracy, precision, recall

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
# Also import the estimator class to use, e.g.: from sklearn.<module> import <Model>

model = ()  # placeholder: replace with an estimator instance
model.fit(X_train, y_train)

# Predict on the test set and wrap the predictions in a DataFrame.
# (Do the same with X_val to obtain validation predictions.)
model_pred_test = pd.DataFrame(model.predict(X_test))

accuracy = accuracy_score(y_test, model_pred_test)
precision = precision_score(y_test, model_pred_test)
recall = recall_score(y_test, model_pred_test)

print("Model X: ")
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(metric_label, metric_value)

### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Fitted models to evaluate and the label used in each plot title
# (same order in both lists).
models = [model]
labels = ['Model X']

# The loop variable is deliberately not named `model`, so the notebook-level
# `model` is not rebound when several models are listed.
for fitted_model, label in zip(models, labels):
    y_pred = fitted_model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(f'Confusion Matrix - {label}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()


### TF Keras Neural Network

In [None]:
import tensorflow as tf

# Fully connected network with a single linear output unit, trained on mean
# absolute error while also tracking mean squared error.
model = tf.keras.Sequential([
    # Explicit Input object instead of `input_dim=` on the first Dense layer,
    # which current Keras versions flag as deprecated.
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.02)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(8, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.02)),
    tf.keras.layers.Dense(1, activation="linear")
])

model.compile(optimizer='adam', loss='mae', metrics=['mean_squared_error'])
# The validation split is evaluated after every epoch; `history` keeps the
# per-epoch loss and metric values.
history = model.fit(X_train, y_train, epochs=100, batch_size=256, validation_data=(X_val, y_val))