In [None]:
"""WOrking on CM1 dataset"""

In [36]:
# Loading CM1 folder
import pandas as pd
import os

# Loading folder path
CM1_path = r"C:\Local Disk (A)\Github Files\Projects\Mass-Sceptra-Classification\Datasets\CM1"
data = []

# Combining all the files in the CM1 folder
for file in os.listdir(CM1_path):
    if file.endswith('.csv'):
        file_path = os.path.join(CM1_path, file)
        df = pd.read_csv(file_path, skiprows=1) # skipping the first row
        # Append to the list of DataFrames
        data.append(df)
# Concatenate all DataFrames into a single DataFrame
combined_CM1_data = pd.concat(data, ignore_index=True)

print(combined_CM1_data.head(5))
print("Rows & columns:", combined_CM1_data.shape)


   #Point  X(Thompsons)    Y(Counts)
0       0          40.0    46.235340
1       1          40.9    58.811787
2       2          41.0   617.287781
3       3          41.2    62.857056
4       4          42.1  3057.952637
Rows & columns: (92310, 3)


In [None]:
# Preprocessing Data

# summary of the datset
print("Summary :")
summary = combined_CM1_data.describe()
print(summary)

# Cheking for missing values
print("")
print("Cheking for missing values :")
is_null = combined_CM1_data.isnull().sum()
print(is_null)

# dropping "#Point" column as it is not relevant
print("")
print("After droping '#Point' column :")
combined_CM1_data = combined_CM1_data.drop('#Point', axis= 1)
print(combined_CM1_data.head(5))

# Rounding up the values of X(Thompsons) column
print("")
print("After Rounding :")
import numpy as np
combined_CM1_data['X(Thompsons)'] = np.ceil(combined_CM1_data['X(Thompsons)'])
print(combined_CM1_data.head(5))

# Adding features

# Import necessary libraries for feature calculations
import numpy as np
from scipy.spatial.distance import cosine

# Adding features
print("\nCreating new features...")

# Define common m/z axis points to align the data
common_mz = np.linspace(combined_CM1_data['X(Thompsons)'].min(), combined_CM1_data['X(Thompsons)'].max(), 1000)

# Create empty lists to store feature values for each row
cosine_similarities = []
area_ratios = []
std_ratios = []

# Iterate through each row in the dataset
for i in range(len(combined_CM1_data)):
    # Extract the current row values
    x_val = combined_CM1_data.loc[i, 'X(Thompsons)']
    y_val = combined_CM1_data.loc[i, 'Y(Counts)']

    # Align current row's Y(Counts) onto the common m/z axis using interpolation
    interpolated_y = np.interp(common_mz, [x_val], [y_val])

    # Calculate cosine similarity (since only one point is interpolated, cosine similarity will be 1)
    cosine_similarities.append(1.0)

    # Calculate area ratio (area will be the same as the only point is aligned)
    area_original = y_val
    area_interpolated = interpolated_y[0]
    area_ratios.append(min(area_original / area_interpolated, area_interpolated / area_original) if area_original > 0 and area_interpolated > 0 else 0)

    # Calculate standard deviation ratio (since it's a single value, ratio is 1)
    std_ratios.append(1.0)

# Add the new features to the DataFrame
combined_CM1_data['cosine_similarity'] = cosine_similarities
combined_CM1_data['area_ratio'] = area_ratios
combined_CM1_data['std_ratio'] = std_ratios

# Display the DataFrame with the new features
print("\nAfter adding new features:")
print(combined_CM1_data.head())

# Normalizing the dataset
"""print("")
print("After Normalization :")
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

scaler = MinMaxScaler()
columns_to_normalize = ['X(Thompsons)', 'Y(Counts)']
combined_CM1_data[columns_to_normalize] = scaler.fit_transform(combined_CM1_data[columns_to_normalize])
print(combined_CM1_data.head(5))"""

Summary :
             #Point  X(Thompsons)      Y(Counts)
count  92310.000000  92310.000000   92310.000000
mean     171.147308    143.243818    1193.070302
std      107.140743     68.833009    9722.745794
min        0.000000     39.900000       0.003136
25%       82.000000     88.200000      22.855255
50%      164.000000    134.000000      70.941631
75%      248.000000    186.100000     283.324089
max      521.000000    413.200000  654898.562500

Cheking for missing values :
#Point          0
X(Thompsons)    0
Y(Counts)       0
dtype: int64

After droping '#Point' column :
   X(Thompsons)    Y(Counts)
0          40.0    46.235340
1          40.9    58.811787
2          41.0   617.287781
3          41.2    62.857056
4          42.1  3057.952637

After Rounding :
   X(Thompsons)    Y(Counts)
0          40.0    46.235340
1          41.0    58.811787
2          41.0   617.287781
3          42.0    62.857056
4          43.0  3057.952637

Creating common m/z axis and aligning spectra...

Ca

ValueError: shapes (92310,) and (1000,) not aligned: 92310 (dim 0) != 1000 (dim 0)

In [30]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split dataset into features and target
X = combined_CM1_data[['X(Thompsons)']]  # Features: X(Thompsons)
y = combined_CM1_data['Y(Counts)']  # Target: Y(Counts)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

# Initialize LightGBM classifier
model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, learning_rate=0.05, n_estimators=100)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [35]:
# Convert Y(Counts) into categories
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

bins = [0, 100, 1000, float('inf')]  # Define ranges for bins (adjust as needed)
labels = ['Low', 'Medium', 'High']  # Define labels for bins
combined_CM1_data['Y(Counts)_category'] = pd.cut(combined_CM1_data['Y(Counts)'], bins=bins, labels=labels)
print(combined_CM1_data.head())

# Use the new categorical target for classification
X = combined_CM1_data[['X(Thompsons)']]  # Feature: X(Thompsons)
y = combined_CM1_data['Y(Counts)_category']  # Target: Y(Counts) as categories

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

# Initialize LightGBM Classifier
model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, learning_rate=0.05, n_estimators=100)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model using Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


   X(Thompsons)    Y(Counts) Y(Counts)_category
0          40.0    46.235340                Low
1          41.0    58.811787                Low
2          41.0   617.287781             Medium
3          42.0    62.857056                Low
4          43.0  3057.952637               High
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 64617, number of used features: 1
[LightGBM] [Info] Start training from score -2.039175
[LightGBM] [Info] Start training from score -0.557073
[LightGBM] [Info] Start training from score -1.214088
Accuracy: 0.59


In [None]:
"""WOrking on CM2 dataset"""

In [None]:
"""WOrking on CM3 dataset"""

In [None]:
"""WOrking on CM1,CM2,CM3 combined"""