<a href="https://colab.research.google.com/github/nagadco/hud/blob/main/RandomForest_(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re

# Arabic script blocks (including presentation forms and the Arabic Mathematical
# Alphabetic Symbols block U+1EE00-U+1EEFF), ASCII letters and ASCII digits.
# Code points above U+FFFF must be written with the 8-digit \UXXXXXXXX escape:
# `\u1EE00` is read as U+1EE0 followed by a literal "0", which silently turned
# the class into the huge range "0"-U+1EEF.
_NAME_CHARACTER_PATTERN = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF'
    r'\U0001EE00-\U0001EEFFa-zA-Z0-9]'
)


def contains_arabic_or_english_or_digits(text):
    """Tell whether ``text`` has at least one Arabic letter, English letter or digit.

    Args:
        text: The value to inspect. Anything that is not a ``str`` (for example
            ``None`` or the NaN pandas uses for a missing cell) is treated as
            containing no usable character.

    Returns:
        bool: True if any character of ``text`` is in an Arabic Unicode block,
        is an ASCII letter, or is an ASCII digit; False otherwise.
    """
    if not isinstance(text, str):
        return False
    return _NAME_CHARACTER_PATTERN.search(text) is not None

def preprocess_data(filepath):
    """Load a listing CSV and convert its columns into binary model features.

    Every column other than 'Name', 'Category' and 'Label' becomes a 0/1 flag:

    * 'Rating' is 1 when the numeric rating is greater than 2, otherwise 0
      (a missing rating counts as 0).
    * Any other column is 1 when a value is present and 0 when the cell is
      null or holds the literal string 'Missing'.
    * All flags of a row are forced to 0 when its 'Name' is missing or is
      rejected by ``contains_arabic_or_english_or_digits``.

    'Label', when the file has such a column, is mapped from 'Good'/'Bad' to
    1/0; any other value becomes NaN. Files without a 'Label' column (new,
    unlabeled data) are accepted as they are.

    Args:
        filepath: Path of the UTF-8 encoded CSV file to read.

    Returns:
        pandas.DataFrame: The columns of the input file, transformed as
        described above.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist (from ``pd.read_csv``).
    """
    schema = {
        'Name': str,
        'Phone_Number': str,
        'Category': str,
        'City': str,
        'Website': str,
        'Rating': float,
        'Working_Hours': str,
        'Media': str,
        'Longitude': float,
        'Latitude': float,
        'District': str,
        'Logo': str,
        'Price_Range': str,
        'Status': str,
        'Full_Address': str,
        'Order_Link': str,
        'Reservation_Link': str,
        'Description': str,
        'Label': str
    }
    non_feature_columns = ['Name', 'Category', 'Label']

    data = pd.read_csv(filepath, dtype=schema, encoding='utf-8')
    feature_columns = [column for column in data.columns if column not in non_feature_columns]

    # Threshold the real rating BEFORE anything is binarized. Comparing an
    # already-binarized 0/1 presence flag with 2 can never be true, which used
    # to make the 'Rating' feature constant 0. NaN > 2 is False, so a missing
    # rating yields 0.
    rating_flag = (data['Rating'] > 2).astype(int)

    # Presence flags for every other feature column.
    for column in feature_columns:
        if column != 'Rating':
            data[column] = data[column].apply(
                lambda value: 0 if pd.isnull(value) or value == 'Missing' else 1
            )
    data['Rating'] = rating_flag

    # Convert 'Label' from 'Good'/'Bad' to 1/0 (absent for unlabeled data).
    if 'Label' in data.columns:
        data['Label'] = data['Label'].map({'Good': 1, 'Bad': 0})

    # Zero all features of rows whose name has no Arabic/English/digit
    # character. The isinstance guard treats a missing name (NaN) as invalid
    # instead of passing it to the regex helper.
    has_valid_name = data['Name'].apply(
        lambda name: isinstance(name, str) and contains_arabic_or_english_or_digits(name)
    ).astype(bool)
    data.loc[~has_valid_name, feature_columns] = 0

    return data

# Example usage: preprocess the combined dataset and display the resulting frame.
# NOTE(review): the path is a hard-coded absolute location; the recorded run of
# this cell failed with FileNotFoundError, so the file must exist there first.
m = preprocess_data("/content/combined.csv")
m


FileNotFoundError: [Errno 2] No such file or directory: '/content/combined.csv'

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler



def train_and_predict(filepath):
    """Train a random-forest classifier on a labeled CSV and report its quality.

    The file is run through ``preprocess_data``; every column except 'Label',
    'Name' and 'Category' is used as a feature. 30% of the rows are held out
    as a test set, the remaining training rows are balanced with
    ``RandomOverSampler``, and a 40-tree ``RandomForestClassifier`` is fitted.
    Accuracy, the classification report, a confusion-matrix heatmap and the
    TP/TN/FP/FN counts for the held-out rows are printed/plotted.

    Args:
        filepath: Path of the labeled training CSV.

    Returns:
        The fitted ``RandomForestClassifier``, or ``None`` when ``filepath``
        does not exist (an error message is printed in that case).
    """
    # Check if the file exists before proceeding
    try:
        data = preprocess_data(filepath)
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        return None  # Return None if file not found

    features = data.drop(['Label', 'Name', 'Category'], axis=1)
    labels = data['Label']

    # Split BEFORE balancing. Oversampling duplicates minority rows; doing it
    # on the full dataset puts copies of the same row in both the training and
    # the test set, which leaks information and inflates every metric.
    # Stratify when each class has enough rows so both splits keep the
    # original class ratio.
    class_counts = labels.value_counts()
    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.3,
        random_state=42,
        stratify=labels if can_stratify else None,
    )

    # Balance only the training portion; the test set keeps the real
    # class distribution.
    ros = RandomOverSampler(random_state=42)
    X_train_balanced, y_train_balanced = ros.fit_resample(X_train, y_train)

    model = RandomForestClassifier(n_estimators=40, random_state=42)
    model.fit(X_train_balanced, y_train_balanced)

    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print(f"Model Accuracy: {accuracy:.2%}")
    print(classification_report(y_test, predictions))

    # labels=[0, 1] pins the matrix to 2x2 even if one class is absent from
    # the test rows, so the four-way unpacking below cannot fail.
    cm = confusion_matrix(y_test, predictions, labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='d')
    plt.title('Confusion Matrix')
    plt.ylabel('Actual Labels')
    plt.xlabel('Predicted Labels')
    plt.show()

    TN, FP, FN, TP = cm.ravel()
    print(f"True Positives (TP): {TP}")
    print(f"True Negatives (TN): {TN}")
    print(f"False Positives (FP): {FP}")
    print(f"False Negatives (FN): {FN}")

    return model

# Example usage: train on the labeled dataset; `model` is None if the file is missing.
# NOTE(review): the path is relative to the kernel's working directory, and
# 'TriningData.csv' looks like a misspelling of 'TrainingData.csv' — confirm
# the actual file name.
model = train_and_predict('TriningData.csv')

In [None]:
import pandas as pd

# Give the unlabeled prediction file a placeholder 'Label' column so it has the
# same columns as the labeled training data.
filepath = '/content/mapp-Al-rawabi_test.csv'

# Read every column as text: the file is rewritten in place below, and letting
# pandas infer numeric types would alter stored values (for example, leading
# zeros of phone numbers would be dropped).
df = pd.read_csv(filepath, dtype=str)

if 'Label' in df.columns:
    # Never clobber labels that are already in the file.
    print("Label column already present; file left unchanged.")
else:
    df['Label'] = 0  # Add a default value of 0 for all rows
    df.to_csv(filepath, index=False)
    print("Label column added successfully.")

In [None]:
def predict_new_data(model, new_data_filepath):
    """Predict labels for the rows of a new CSV file with a trained model.

    The file is run through ``preprocess_data``; 'Name' and 'Category' (and
    'Label', if present) are removed and the remaining columns are passed to
    ``model.predict``.

    Args:
        model: A fitted estimator exposing ``predict`` (for example the value
            returned by ``train_and_predict``).
        new_data_filepath: Path of the CSV file with the rows to classify.

    Returns:
        pandas.DataFrame: Two columns, 'Name' and 'Predicted Label', one row
        per input row.

    Raises:
        ValueError: If ``model`` is None.
    """
    if model is None:
        # train_and_predict() returns None when its training file is missing;
        # fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            "model is None; train a model successfully before predicting."
        )

    new_data = preprocess_data(new_data_filepath)
    # errors='ignore' keeps this working for data that carries no 'Label' column.
    new_features = new_data.drop(columns=['Label', 'Name', 'Category'], errors='ignore')
    new_predictions = model.predict(new_features)

    # Create a DataFrame to display names and their predicted labels
    results_df = pd.DataFrame({
        'Name': new_data['Name'],
        'Predicted Label': new_predictions
    })
    return results_df


# Requires `model` from the training cell; the recorded run of this cell failed
# with NameError because `model` had not been defined in the kernel.
new_predictions_df = predict_new_data(model, '/content/mapp-Al-rawabi_test.csv')  # Predict new data
new_predictions_df.head()  # Display first few rows of the results


NameError: name 'model' is not defined

In [None]:
# Write the Name / Predicted Label table to CSV without the index column.
# NOTE(review): depends on `new_predictions_df` from the prediction cell; the
# recorded run failed with NameError because that variable did not exist.
new_predictions_df.to_csv('/content/your_predictions00.csv', index=False)


NameError: name 'new_predictions_df' is not defined

In [None]:
import os
# Print the absolute form of the predictions path.
# NOTE(review): this names 'your_predictions.csv', while the export cell writes
# 'your_predictions00.csv' — confirm which file name is intended.
print(os.path.abspath('/content/your_predictions.csv'))


/content/your_predictions.csv
