In [3]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Build a class-balanced, standardized training set from the wine-quality data
# and save it to 'balancedWINE.csv'.
#
# Order of operations matters here:
#   1. split into train/test on the raw features,
#   2. fit the scaler on the training split only,
#   3. oversample the (scaled) training split with SMOTE.
# Fitting the scaler before the split would let the mean/variance of the
# held-out rows leak into the training features.

# Load the dataset
file_path = 'winequality-combined.csv'
data = pd.read_csv(file_path)

# Separate features and target variable
X = data.drop('quality', axis=1)
y = data['quality']

# Split the raw dataset into training and testing sets (seeded for reproducibility)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale data: learn mean/std from the training split only, then apply the same
# transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole dataset under the training-fitted scaler. Nothing in this cell reads it;
# it is kept defined in case a later cell still references `X_scaled`.
X_scaled = scaler.transform(X)

# Apply SMOTE to generate synthetic samples and balance the training set.
# k_neighbors=1 lets SMOTE run on classes with as few as two training samples.
# random_state makes the synthetic samples (and so the saved CSV) reproducible.
smote = SMOTE(k_neighbors=1, random_state=42)
X_sm, y_sm = smote.fit_resample(X_train, y_train)

# Combine the balanced features and target variable into a single DataFrame
balanced_data = pd.DataFrame(X_sm, columns=X.columns)
balanced_data['quality'] = y_sm

# Save the balanced dataset to a new CSV file
balanced_file_path = 'balancedWINE.csv'
balanced_data.to_csv(balanced_file_path, index=False)

print(f"Balanced dataset saved to {balanced_file_path}")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Balanced dataset saved to balancedWINE.csv


In [6]:
import pandas as pd

# Sanity check: show the last rows of the original CSV next to the last rows
# of the SMOTE-balanced CSV.
original_file_path = 'winequality-combined.csv'
balanced_file_path = 'balancedWINE.csv'

# Read both files back from disk
original_data = pd.read_csv(original_file_path)
balanced_data = pd.read_csv(balanced_file_path)

# Print a heading followed by the tail of each dataset, original first
for heading, frame in (
    ("\n original dataset last entries:", original_data),
    ("\n balanced dataset last entries:", balanced_data),
):
    print(heading)
    print(frame.tail())


 original dataset last entries:
      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
6492            6.2              0.21         0.29             1.6      0.039   
6493            6.6              0.32         0.36             8.0      0.047   
6494            6.5              0.24         0.19             1.2      0.041   
6495            5.5              0.29         0.30             1.1      0.022   
6496            6.0              0.21         0.38             0.8      0.020   

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
6492                 24.0                  92.0  0.99114  3.27       0.50   
6493                 57.0                 168.0  0.99490  3.15       0.46   
6494                 30.0                 111.0  0.99254  2.99       0.46   
6495                 20.0                 110.0  0.98869  3.34       0.38   
6496                 22.0                  98.0  0.98941  3.26       0.32   

      alcohol  qu

In [4]:
import pandas as pd
from scipy import stats
import numpy as np

# Remove outlier rows from the balanced dataset with a per-feature Z-score
# rule and save the filtered data to 'balancedOUTLIER.csv'.

# Load the balanced dataset from 'balancedWINE.csv'
file_path = 'balancedWINE.csv'
balanced_data = pd.read_csv(file_path)

# Only numeric feature columns take part in outlier detection. The target
# column 'quality' is a class label, so it must not decide whether a row is
# treated as an outlier.
feature_columns = (
    balanced_data.select_dtypes(include=[np.number])
    .columns.drop('quality', errors='ignore')
)

# Calculate absolute Z-scores column by column. nan_policy='omit' stops a
# single missing value from turning an entire column's Z-scores into NaN.
z_scores = np.abs(
    stats.zscore(
        balanced_data[feature_columns].to_numpy(dtype=float),
        axis=0,
        nan_policy='omit',
    )
)

# Define a Z-score threshold to identify outliers
threshold = 3

# A row is an outlier when any feature's |z| reaches the threshold. NaN
# Z-scores (missing value or zero-variance column) compare as False, so they
# never flag a row; with `(z < threshold).all(axis=1)` they would silently
# drop every row instead.
is_outlier = (z_scores >= threshold).any(axis=1)
non_outlier_indices = ~is_outlier

# Filter the DataFrame to only include rows without outliers
data_without_outliers = balanced_data[non_outlier_indices]

# Save the dataset without outliers to a new CSV file
outliers_removed_file_path = 'balancedOUTLIER.csv'
data_without_outliers.to_csv(outliers_removed_file_path, index=False)

# Print a message to confirm completion
print(f"Dataset with outliers removed has been saved to {outliers_removed_file_path}.")
print(f"New dataset contains {data_without_outliers.shape[0]} rows and {data_without_outliers.shape[1]} columns.")

Dataset with outliers removed has been saved to balancedOUTLIER.csv.
New dataset contains 14124 rows and 12 columns.
