In [None]:
# Multi-class Random Forest Model

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import check_X_y
import time
from statistics import mean

In [None]:
# Restore `dataset_pred_bc` from IPython's %store database into this kernel.
# The variable must have been saved earlier with `%store dataset_pred_bc`
# (presumably by an upstream notebook -- verify).
%store -r dataset_pred_bc

In [None]:
# Bind the restored frame to the working name used by the rest of the notebook.
# NOTE: this is an alias, not a copy -- in-place edits to `dataset` also change `dataset_pred_bc`.
dataset = dataset_pred_bc
dataset.head()

In [None]:
# Horizontal bar chart of the number of reviews per concern (`cat1`);
# counts are sorted ascending, so the smallest category is drawn at the bottom.
fig, ax = plt.subplots(figsize=(8, 6))

reviews_per_concern = (
    dataset
    .groupby('cat1')['clean_content']
    .count()
    .sort_values(ascending=True)
)
reviews_per_concern.plot.barh(ylim=0, color='#1f77b4', ax=ax)

# Axis titles and tick labels all use the same 14pt font.
ax.set_xlabel('Number of reviews', fontsize=14)
ax.set_ylabel('Ethical concerns', fontsize=14)
ax.tick_params(axis='both', labelsize=14)

plt.show()

In [None]:
# Write the per-concern review counts (ascending) to all_concern.csv.
all_concern_counts = (
    dataset
    .groupby('cat1')['clean_content']
    .count()
    .sort_values(ascending=True)
)
all_concern_counts.to_csv('all_concern.csv')

In [None]:
def clean_no_concern(dataset, min_count=50, excluded=('Other', 'none', 'Noise')):
    """Keep only reviews with text that belong to a frequent, real concern.

    Rows are dropped when `clean_content` is null, when `cat1` is one of the
    `excluded` placeholder labels, or when their concern has `min_count` or
    fewer remaining reviews.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns `cat1` and `clean_content`.
    min_count : int, default 50
        A concern is kept only if it has strictly more than this many reviews
        with non-null text.
    excluded : iterable of str, default ('Other', 'none', 'Noise')
        `cat1` labels that never count as a concern.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows; the input is not modified.
    """
    has_text = dataset['clean_content'].notna()
    is_real_concern = ~dataset['cat1'].isin(list(excluded))
    df = dataset[has_text & is_real_concern]

    counts = df.groupby('cat1')['clean_content'].count()
    top_list = counts[counts > min_count].index.to_list()

    # Filter the cleaned frame, not the raw input: filtering `dataset` here would
    # bring rows with null `clean_content` back into the result.
    # `.copy()` lets callers add columns without chained-assignment warnings.
    result = df[df['cat1'].isin(top_list)].copy()
    print(result.groupby('cat1').clean_content.count())
    return result

In [None]:
# Drop placeholder labels and rare concerns; `dataset` is rebound to the filtered frame.
dataset = clean_no_concern(dataset)

In [None]:
def factorize_concern(dataset):
    """Attach an integer id to every concern label and build both lookup tables.

    Ids are assigned by `Series.factorize`, i.e. in order of first appearance
    of each `cat1` value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the column `cat1`. It is not modified.

    Returns
    -------
    dataset : pandas.DataFrame
        Copy of the input with an added `cat1_id` column.
    concern_id_df : pandas.DataFrame
        One row per distinct (`cat1`, `cat1_id`) pair, sorted by `cat1_id`.
    concern_to_id : dict
        Maps label -> id.
    id_to_concern : dict
        Maps id -> label.
    """
    # Work on a copy so the caller's frame (possibly a slice of another frame)
    # is not given a new column behind its back.
    dataset = dataset.copy()
    dataset['cat1_id'] = dataset['cat1'].factorize()[0]

    concern_id_df = (
        dataset[['cat1', 'cat1_id']]
        .drop_duplicates()
        .sort_values('cat1_id')
    )
    concern_to_id = dict(zip(concern_id_df['cat1'], concern_id_df['cat1_id']))
    id_to_concern = dict(zip(concern_id_df['cat1_id'], concern_id_df['cat1']))
    return dataset, concern_id_df, concern_to_id, id_to_concern

In [None]:
# Add `cat1_id` to the frame and build the label<->id lookups.
# `id_to_concern` is read as a global by `add_predict`.
dataset, concern_id_df, concern_to_id, id_to_concern = factorize_concern(dataset)

In [None]:
# Show the label -> id mapping.
concern_to_id

In [None]:
# Show the filtered frame, which now includes the `cat1_id` column.
dataset

In [None]:
def vectorizer(dataset):
    """Build a dense TF-IDF feature matrix and the matching label vector.

    A new `TfidfVectorizer` (uni- and bi-grams, sublinear tf, l2 norm,
    `min_df=5`) is fitted on `dataset.clean_content`. The fitted vectorizer
    itself is not returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must expose `clean_content` (documents) and `cat1_id` (labels).

    Returns
    -------
    features : numpy.ndarray
        Dense array produced by `.toarray()` on the fitted TF-IDF matrix.
    labels : pandas.Series
        The `cat1_id` column of `dataset`.
    """
    tfidf_options = dict(
        sublinear_tf=True,
        min_df=5,
        norm='l2',
        encoding='latin-1',
        ngram_range=(1, 2),
    )
    tfidf = TfidfVectorizer(**tfidf_options)
    tfidf_matrix = tfidf.fit_transform(dataset.clean_content)
    features = tfidf_matrix.toarray()
    labels = dataset.cat1_id
    print("features: ", features.shape)
    return features, labels

In [None]:
# NOTE(review): TF-IDF is fitted on every row of `dataset`, i.e. before the
# train/test split done later in the notebook, so held-out documents contribute
# to the vocabulary and IDF weights.
features, labels = vectorizer(dataset)

In [None]:
def add_predict(df, y_pred, indices_test, id_to_label=None):
    """Write predicted concern labels into a `predicted` column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place and also returned.
    y_pred : iterable
        Predicted class ids, one per entry of `indices_test`.
    indices_test : iterable
        Index labels of `df` that the predictions belong to.
    id_to_label : dict, optional
        Maps class id -> label. Defaults to the notebook-level `id_to_concern`
        mapping, which keeps existing three-argument calls working.

    Returns
    -------
    pandas.DataFrame
        The same object as `df`. Rows not listed in `indices_test` hold NaN in
        `predicted`.

    Raises
    ------
    KeyError
        If a predicted id is missing from the mapping.
    """
    if id_to_label is None:
        id_to_label = id_to_concern

    # Create the column up front so it exists even when there are no predictions.
    if "predicted" not in df.columns:
        df["predicted"] = pd.Series(index=df.index, dtype=object)

    for ind, pred in zip(indices_test, y_pred):
        df.at[ind, "predicted"] = id_to_label[int(pred)]
    return df

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold, GridSearchCV, cross_val_score, cross_validate


# Random-forest hyper-parameters, passed unchanged to the classifier.
rf_params = dict(
    n_estimators=200,
    max_depth=150,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=0.0015,
    bootstrap=False,
    random_state=1,
)
modelRF = RandomForestClassifier(**rf_params)

# Hold out 33% of the rows. The frame index is split alongside the features and
# labels so each prediction can be written back to its original row.
(X_train, X_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    features, labels, dataset.index, test_size=0.33, random_state=0
)

modelRF.fit(X_train, y_train)
y_pred = modelRF.predict(X_test)
print("predicted dataset length: ", len(indices_test))

# add_predict writes into `dataset` and returns it, so `predict_dataset` is the
# same object as `dataset`.
predict_dataset = add_predict(dataset, y_pred, indices_test)

    

In [None]:
# Split each cleaned review into tokens on single spaces.
# The column is overwritten with its own transformation, so the isinstance guard
# keeps the cell re-runnable: values that are already lists (or missing) pass
# through unchanged rather than raising AttributeError.
predict_dataset['clean_content'] = predict_dataset['clean_content'].apply(
    lambda text: text.split(' ') if isinstance(text, str) else text
)
predict_dataset['clean_content']

In [None]:
# Keep only the rows that received a prediction (the held-out test rows), then
# list the distinct predicted labels. The original cell also evaluated
# `.unique()` before filtering; that result was never displayed, so it is dropped.
predict_dataset = predict_dataset.dropna(subset=['predicted'])
predict_dataset['predicted'].unique()

In [None]:
# Rows whose predicted concern differs from the annotated `cat1` label.
mismatch_mask = predict_dataset['predicted'].ne(predict_dataset['cat1'])
wrong_predictions = predict_dataset.loc[mismatch_mask]
print(wrong_predictions[['cat1', 'predicted']])

In [None]:
# Number of rows in `predict_dataset` (length of its `predicted` column).
len(predict_dataset['predicted'])

In [None]:
# Persist `predict_dataset` in IPython's %store database so another session can
# load it with `%store -r predict_dataset`.
%store predict_dataset

In [None]:
# Distinct predicted concern labels.
predict_dataset['predicted'].unique()

In [None]:
# Number of reviews per (predicted concern, app) pair, largest first; also
# written to predicted_concern_per_app.csv.
df_count = (
    predict_dataset
    .groupby(['predicted', 'app_name'])['clean_content']
    .count()
    .sort_values(ascending=False)
    .reset_index(name='counts')
)
df_count.to_csv('predicted_concern_per_app.csv')

In [None]:
# Show the per-(predicted concern, app) counts.
df_count

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Stacked horizontal bar chart of predicted concerns, one coloured segment per app.
# Expects `df_count` with the columns: predicted, app_name, counts.
#
# The previous version called ax.barh once per app with every bar starting at 0,
# so bars of different apps on the same concern row were drawn on top of each
# other and partly hidden. Pivoting to a concern x app table and stacking keeps
# every app's count visible.
counts_by_app = (
    df_count
    .pivot(index='predicted', columns='app_name', values='counts')
    .fillna(0)
)
# Order concerns by total count (ascending), so the most frequent one is at the top.
counts_by_app = counts_by_app.loc[counts_by_app.sum(axis=1).sort_values().index]

fig, ax = plt.subplots(figsize=(8, 6))  # Adjust the figsize as needed
colors = sns.color_palette("tab20", counts_by_app.shape[1])  # one colour per app
counts_by_app.plot.barh(stacked=True, color=colors, ax=ax)

# Axis titles and tick labels all use the same 14pt font.
ax.set_xlabel('Number of reviews', fontsize=14)
ax.set_ylabel('Predicted concerns', fontsize=14)
ax.tick_params(axis='both', labelsize=14)
ax.legend()

# Show the plot
plt.show()


In [None]:
# Group the DataFrame by app_name
grouped_df = df_count.groupby('app_name')

# NOTE(review): printing a GroupBy object shows only its repr, not the groups;
# presumably a leftover debugging statement -- confirm whether a summary was intended.
print(grouped_df)

In [None]:
# Distinct predicted concerns, plus the same concerns ordered by their total
# review count across all apps (ascending).
categories = df_count['predicted'].unique()
total_counts = df_count.groupby('predicted', as_index=False)['counts'].sum()
sorted_counts = total_counts.sort_values(by='counts')
sorted_categories = sorted_counts['predicted']


In [None]:
# Count of non-null `clean_content` values per (predicted concern, app) pair.
predict_dataset.groupby(['predicted','app_name']).clean_content.count()