In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

file_path = r"C:\Users\nlpsw\Downloads\sample1.csv"
df = pd.read_csv(file_path)


features = ['OHQ030', 'OHQ033', 'OHQ770', 'OHQ780A', 'OHQ780B', 'OHQ780C', 'OHQ780D', 'OHQ780E', 
            'OHQ780F', 'OHQ780G', 'OHQ780H', 'OHQ780I', 'OHQ780J', 'OHQ780K', 'OHQ555G', 'OHQ555Q', 
            'OHQ555U', 'OHQ560G', 'OHQ560Q', 'OHQ560U', 'OHQ566', 'OHQ571Q', 'OHQ571U', 'OHQ576G', 
            'OHQ576Q', 'OHQ576U', 'OHQ610', 'OHQ612', 'OHQ620', 'OHQ640', 'OHQ835', 'OHQ848G', 
            'OHQ848Q', 'OHQ849', 'OHQ850', 'OHQ860', 'OHQ870']
target = 'OHQ845' 

df = df[df[target].notna()]

label_encoders = {}
for feature in features:
    if df[feature].dtype == 'object':
        label_encoders[feature] = LabelEncoder()
        df[feature] = label_encoders[feature].fit_transform(df[feature].astype(str))


imputer = SimpleImputer(strategy='mean')  
X = imputer.fit_transform(df[features])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)

model.fit(X_train, y_train)


y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R-squared:", r2)


Mean Absolute Error: 0.7945470340765236
Mean Squared Error: 0.995970415177171
R-squared: 0.3102149640908748


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder


file_path = r"C:\Users\nlpsw\Downloads\sample1.csv"
df = pd.read_csv(file_path)


features = ['OHQ030', 'OHQ033', 'OHQ770', 'OHQ780A', 'OHQ780B', 'OHQ780C', 'OHQ780D', 'OHQ780E', 
            'OHQ780F', 'OHQ780G', 'OHQ780H', 'OHQ780I', 'OHQ780J', 'OHQ780K', 'OHQ555G', 'OHQ555Q', 
            'OHQ555U', 'OHQ560G', 'OHQ560Q', 'OHQ560U', 'OHQ566', 'OHQ571Q', 'OHQ571U', 'OHQ576G', 
            'OHQ576Q', 'OHQ576U', 'OHQ610', 'OHQ612', 'OHQ620', 'OHQ640', 'OHQ835', 'OHQ848G', 
            'OHQ848Q', 'OHQ849', 'OHQ850', 'OHQ860', 'OHQ870']
target = 'OHQ845'  

df = df[df[target].notna()]


label_encoders = {}
for feature in features:
    if df[feature].dtype == 'object':
        label_encoders[feature] = LabelEncoder()
        df[feature] = label_encoders[feature].fit_transform(df[feature].astype(str))


imputer = SimpleImputer(strategy='mean')  # You can choose different strategies like 'median' or 'most_frequent'
X = imputer.fit_transform(df[features])
y = df[target]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = RandomForestRegressor(random_state=42)

model.fit(X_train, y_train)


y_pred = model.predict(X_test)


mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R-squared:", r2)


new_case = X_test[0].reshape(1, -1)  
predicted_rating = model.predict(new_case)[0]

if predicted_rating <= 2.5:
    risk_category = "Low risk"
else:
    risk_category = "High risk"

print(f"Predicted Rating: {predicted_rating}")
print(f"Risk Category: {risk_category}")


Mean Absolute Error: 0.7945470340765236
Mean Squared Error: 0.995970415177171
R-squared: 0.3102149640908748
Predicted Rating: 1.9604492805458753
Risk Category: Low risk


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Define the file path for the existing dataset
file_path = r"C:\Users\nlpsw\Downloads\sample1.csv"

# Load the existing dataset
df = pd.read_csv(file_path)

# Extract features and target variable
features = ['OHQ030', 'OHQ555G', 'OHQ560Q', 'OHQ566', 'OHQ620', 'OHQ835',
            'OHQ848Q', 'OHQ849', 'OHQ850', 'OHQ860', 'OHQ870']
target = 'OHQ620'

# Replace NaN values with 77 in both features and target
df[features] = df[features].fillna(77)
df[target] = df[target].fillna(77)

# Encode categorical variables using LabelEncoder
label_encoders = {}
for feature in features:
    if df[feature].dtype == 'object':
        label_encoders[feature] = LabelEncoder()
        df[feature] = label_encoders[feature].fit_transform(df[feature].astype(str))

# Split data into training and testing sets
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Print classification report
print(classification_report(y_test, y_pred))

# Example of making predictions on new data



Accuracy: 0.9991565926342424
              precision    recall  f1-score   support

         1.0       0.94      1.00      0.97        34
         2.0       1.00      0.96      0.98        52
         3.0       1.00      1.00      1.00       181
         4.0       1.00      1.00      1.00       284
         5.0       1.00      1.00      1.00       395
         9.0       0.00      0.00      0.00         1
        77.0       1.00      1.00      1.00      2610

    accuracy                           1.00      3557
   macro avg       0.85      0.85      0.85      3557
weighted avg       1.00      1.00      1.00      3557



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Define the file path for the existing dataset
file_path = r"C:\Users\nlpsw\Downloads\sample1.csv"

# Load the existing dataset
df = pd.read_csv(file_path)

# Extract features and target variables
features = ['OHQ030', 'OHQ555G', 'OHQ560Q', 'OHQ566', 'OHQ620', 'OHQ835',
            'OHQ848Q', 'OHQ849', 'OHQ850', 'OHQ860', 'OHQ870']
targets = ['OHQ620', 'OHQ835']

# Replace NaN values with 77 in features and targets
df[features] = df[features].fillna(77)
for target in targets:
    df[target] = df[target].fillna(77)

# Encode categorical variables using LabelEncoder
label_encoders = {}
for feature in features:
    if df[feature].dtype == 'object':
        label_encoders[feature] = LabelEncoder()
        df[feature] = label_encoders[feature].fit_transform(df[feature].astype(str))

# Convert targets to binary if they are continuous
def convert_to_binary(y, threshold=0.5):
    return (y > threshold).astype(int)

# Convert target variables to binary
df['OHQ620'] = convert_to_binary(df['OHQ620'])
df['OHQ835'] = convert_to_binary(df['OHQ835'])

# Split data into training and testing sets
X = df[features]
y1 = df['OHQ620']
y2 = df['OHQ835']
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(X, y1, y2, test_size=0.5, random_state=42)

# Function to add Gaussian noise to features
def add_noise_to_features(X, noise_level=0.1):
    noise = np.random.normal(loc=0, scale=noise_level, size=X.shape)
    return X + noise

# Function to add noise to target labels
def add_noise_to_labels(y, noise_level=0.1):
    y_array = y.values if hasattr(y, 'values') else y  # Convert to NumPy array if necessary
    noisy_labels = y_array.copy()
    num_samples = len(y_array)
    num_noisy_labels = int(noise_level * num_samples)
    
    # Randomly choose indices to flip
    noisy_indices = np.random.choice(num_samples, num_noisy_labels, replace=False)
    
    # Flip the labels
    for idx in noisy_indices:
        noisy_labels[idx] = 1 - noisy_labels[idx]  # Assuming binary classification
    
    return noisy_labels

# Add noise to features
noise_level = 0.1 # Adjust noise level as needed
X_train_noisy = add_noise_to_features(X_train, noise_level)
X_test_noisy = add_noise_to_features(X_test, noise_level)

# Add noise to target labels
y1_train_noisy = add_noise_to_labels(y1_train, noise_level)
y1_test_noisy = add_noise_to_labels(y1_test, noise_level)
y2_train_noisy = add_noise_to_labels(y2_train, noise_level)
y2_test_noisy = add_noise_to_labels(y2_test, noise_level)

# Initialize Random Forest models for each target variable
model1 = RandomForestClassifier(n_estimators=100, random_state=42)
model2 = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the models
model1.fit(X_train_noisy, y1_train_noisy)
model2.fit(X_train_noisy, y2_train_noisy)

# Predict on test data for each target variable
y1_pred = model1.predict(X_test_noisy)
y2_pred = model2.predict(X_test_noisy)

# Calculate accuracies for each target variable
accuracy1 = accuracy_score(y1_test_noisy, y1_pred)
accuracy2 = accuracy_score(y2_test_noisy, y2_pred)

# Combine accuracies using average or any other metric
combined_accuracy = (accuracy1 + accuracy2) / 2

print(f"Accuracy for OHQ620: {accuracy1}")
print(classification_report(y1_test_noisy, y1_pred))

print(f"\nAccuracy for OHQ835: {accuracy2}")
print(classification_report(y2_test_noisy, y2_pred))

print(f"\nCombined Accuracy: {combined_accuracy}")


Accuracy for OHQ620: 0.8994488808907884
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       889
           1       0.90      1.00      0.95      8002

    accuracy                           0.90      8891
   macro avg       0.45      0.50      0.47      8891
weighted avg       0.81      0.90      0.85      8891


Accuracy for OHQ835: 0.9000112473287594
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       889
           1       0.90      1.00      0.95      8002

    accuracy                           0.90      8891
   macro avg       0.45      0.50      0.47      8891
weighted avg       0.81      0.90      0.85      8891


Combined Accuracy: 0.8997300641097739


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Define the file path for the existing dataset
file_path = r"C:\Users\nlpsw\Downloads\sample1.csv"

# Load the existing dataset
df = pd.read_csv(file_path)

# Extract features and target variables
features = ['OHQ030', 'OHQ555G', 'OHQ560Q', 'OHQ566', 'OHQ620', 'OHQ835',
            'OHQ848Q', 'OHQ849', 'OHQ850', 'OHQ860', 'OHQ870']
targets = ['OHQ620', 'OHQ835']

# Replace NaN values with 77 in features and targets
df[features] = df[features].fillna(77)
for target in targets:
    df[target] = df[target].fillna(77)

# Encode categorical variables using LabelEncoder
label_encoders = {}
for feature in features:
    if df[feature].dtype == 'object':
        label_encoders[feature] = LabelEncoder()
        df[feature] = label_encoders[feature].fit_transform(df[feature].astype(str))

# Split data into training and testing sets
X = df[features]
y1 = df['OHQ620']
y2 = df['OHQ835']
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(X, y1, y2, test_size=0.5, random_state=42)

# Initialize SVM models for each target variable
model1 = SVC(kernel='rbf', random_state=42)
model2 = SVC(kernel='rbf', random_state=42)

# Train the models
model1.fit(X_train, y1_train)
model2.fit(X_train, y2_train)

# Predict on test data for each target variable
y1_pred = model1.predict(X_test)
y2_pred = model2.predict(X_test)

# Calculate accuracies for each target variable
accuracy1 = accuracy_score(y1_test, y1_pred)
accuracy2 = accuracy_score(y2_test, y2_pred)

# Combine accuracies using average or any other metric
combined_accuracy = (accuracy1 + accuracy2) / 2

print(f"Accuracy for OHQ620 (SVM): {accuracy1}")
print(classification_report(y1_test, y1_pred))

print(f"\nAccuracy for OHQ835 (SVM): {accuracy2}")
print(classification_report(y2_test, y2_pred))

print(f"\nCombined Accuracy (SVM): {combined_accuracy}")


Accuracy for OHQ620 (SVM): 0.8495107411989652
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00        85
         2.0       0.00      0.00      0.00       109
         3.0       0.00      0.00      0.00       435
         4.0       0.00      0.00      0.00       708
         5.0       0.44      1.00      0.61      1051
         9.0       0.00      0.00      0.00         1
        77.0       1.00      1.00      1.00      6502

    accuracy                           0.85      8891
   macro avg       0.21      0.29      0.23      8891
weighted avg       0.78      0.85      0.80      8891


Accuracy for OHQ835 (SVM): 0.9432009897649308
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00       471
         2.0       0.79      1.00      0.88      1884
         9.0       0.00      0.00      0.00        34
        77.0       1.00      1.00      1.00      6502

    accuracy                          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Define the file path for the existing dataset
file_path = r"C:\Users\nlpsw\Downloads\sample1.csv"

# Load the existing dataset
df = pd.read_csv(file_path)
total_records = len(df)
print(f'Total number of records: {total_records}')

Total number of records: 17782
