In [4]:
import pandas as pd
import numpy as np

# Step 1: Load dataset
data = pd.read_csv('/content/drive/MyDrive/RegresiUTSTelkom.csv')  # Replace with your dataset path

# # Rename columns to x1, x2, ..., x91
column_names = [f'x{i}' for i in range(1, data.shape[1] + 1)]
data.columns = column_names

# Step 2: Drop duplicate rows
data = data.drop_duplicates()
print(f"Dataset shape after dropping duplicates: {data.shape}")

# Step 3: Transform target (log-transform x20)
data['x20_log'] = np.log1p(data['x20'])  # Transform target to reduce skewness
target = data['x20_log']
data = data.drop(columns=['x20', 'x20_log'])  # Drop original and transformed target for feature selection

# Step 4: Select features based on correlation with x20_log
correlation_threshold = 0.1
correlation_with_target = data.corrwith(target).abs()  # Compute absolute correlation
selected_features = correlation_with_target[correlation_with_target > correlation_threshold].index
data_selected = data[selected_features]

# Step 5: Apply Variance Threshold to selected features
from sklearn.feature_selection import VarianceThreshold

variance_threshold = 0.1
selector = VarianceThreshold(threshold=variance_threshold)
data_high_variance = selector.fit_transform(data_selected)

# Convert back to DataFrame
data_final = pd.DataFrame(data_high_variance, columns=[col for col, keep in zip(data_selected.columns, selector.get_support()) if keep])

# Step 6: Calculate skewness and remove features with high skewness
skewness = data_final.skew().sort_values(ascending=False)
print("Skewness before removing high-skewed features:")
print(skewness)

# Remove features with skewness > 1 or skewness < -1
high_skewed_features = skewness[(skewness > 1) | (skewness < -1)].index
print(f"Removing features with high skewness: {list(high_skewed_features)}")
data_final = data_final.drop(columns=high_skewed_features)

# Step 7: Save the reduced dataset
processed_file_path = "/content/RegresiUTSTelkom_cleaned.csv"
data_final.to_csv(processed_file_path, index=False)

print("Processed dataset saved as:", processed_file_path)

Dataset shape after dropping duplicates: (515130, 91)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['x20_log'] = np.log1p(data['x20'])  # Transform target to reduce skewness


Skewness before removing high-skewed features:
x59    4.510444
x25    4.171404
x17    3.450913
x22    3.283011
x24    3.139351
x18    3.102336
x70    3.081634
x15    2.926859
x21    2.669088
x19    2.248271
x16    2.222596
x82    2.187223
x27    2.136054
x14    2.053276
x23    1.997108
x84    1.690973
x58    1.673140
x57    1.600399
x49    1.453871
x38    1.446828
x60    1.318823
x34    1.318001
x37    1.208768
x53    1.015985
x5     0.974358
x7     0.846846
x64    0.832200
x39    0.519905
x68    0.326747
x47    0.156928
x85    0.106440
x65    0.089468
x6     0.068126
x4    -0.153239
x12   -0.193047
x10   -0.256650
x32   -0.516374
x76   -0.661759
x51   -0.853213
x3    -0.860269
x2    -0.895416
x81   -1.002561
x48   -1.100670
x83   -1.365475
x77   -1.600167
x72   -2.069324
x56   -2.250034
x40   -2.454756
x44   -3.273497
x80   -4.339337
dtype: float64
Removing features with high skewness: ['x59', 'x25', 'x17', 'x22', 'x24', 'x18', 'x70', 'x15', 'x21', 'x19', 'x16', 'x82', 'x27', 'x14', '