# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from wordcloud import WordCloud

# Read the AI-vs-human text dataset from its Kaggle input mount
# (Kaggle exposes datasets under /kaggle/input/[dataset-name]).
df = pd.read_csv('/kaggle/input/ai-human/AI_Human.csv')

# Preview the leading rows.
print(df.head())

# Tally how often each label value occurs in 'generated'; a value that is
# absent from the data is reported as 0.
counts = df['generated'].value_counts()
num_zeros, num_ones = (counts.get(value, 0) for value in (0, 1))

print('------------------------------------------------')

# Report the label tallies.
print(f"Number of 0s in 'generated' column: {num_zeros}")
print(f"Number of 1s in 'generated' column: {num_ones}")

# Model inputs: every text coerced to str, plus the parallel list of labels.
comments = df['text'].astype(str).to_list()
labels = df['generated'].to_list()

print('------------------------------------------------')

# Count plot of the label values in the 'generated' column.
sns.countplot(data=df, x='generated').set(
    title='Distribution of Generated Labels',
    xlabel='Generated',
    ylabel='Count',
)
plt.show()

# Length (in characters) of each text entry. The column is coerced to str
# first -- the same coercion applied when `comments` is built -- so a missing
# or otherwise non-string cell cannot make len() raise TypeError.
df['text_length'] = df['text'].astype(str).str.len()

# Average length across the whole 'text' column.
average_length = df['text_length'].mean()

# Display the average length
print(f"Average length of text in the 'text' column: {average_length}")

# Silence only the FutureWarning whose message starts with the text below;
# it is filtered before the plotting calls that follow.
warnings.filterwarnings('ignore', category=FutureWarning, message="use_inf_as_na option is deprecated")

# Histogram of text lengths (50 bins) with a KDE overlay.
plt.figure(figsize=(10, 6))
sns.histplot(df['text_length'], bins=50, kde=True)
plt.title('Distribution of Text Lengths')
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.show()

# Box plot of text lengths, one box per 'generated' label.
plt.figure(figsize=(10, 6))
sns.boxplot(x='generated', y='text_length', data=df)
plt.title('Box Plot of Text Lengths by Generated Label')
plt.xlabel('Generated')
plt.ylabel('Text Length')
plt.show()

# Violin plot of text lengths, one violin per 'generated' label.
plt.figure(figsize=(10, 6))
sns.violinplot(x='generated', y='text_length', data=df)
plt.title('Violin Plot of Text Lengths by Generated Label')
plt.xlabel('Generated')
plt.ylabel('Text Length')
plt.show()

# Split the raw texts BEFORE vectorizing, so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only. Fitting the vectorizer
# on the full corpus would leak test-set statistics into the features and
# make every score reported below optimistic.
train_texts, test_texts, y_train, y_test = train_test_split(
    comments, labels, test_size=0.2, random_state=42
)

# Vectorize the text data: fit on the training texts, then reuse the fitted
# vocabulary/IDF to transform the held-out texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Train a simple baseline model; max_iter is raised to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# RANDOM FOREST CLASSIFIER
# A 100-tree forest with a fixed seed, trained and scored on the existing split.
print("------------------------------------------------")
print("Random Forest Classifier")
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy}")
# One print call with a newline separator emits the heading and the report on
# separate lines, exactly as two consecutive prints would.
print(
    "Classification Report:",
    classification_report(y_test, y_pred_rf, zero_division=0),
    sep="\n",
)

# SUPPORT VECTOR MACHINE (SVM)
# Linear-kernel SVC with a fixed seed, trained and scored on the existing split.
# NOTE(review): kernel-SVC training time grows quickly with the number of
# samples; if this step is too slow on the full dataset, LinearSVC may be a
# better fit -- confirm against the dataset size.
print("------------------------------------------------")
print("Support Vector Machine (SVM)")
# fit() returns the estimator itself, so construction and training are chained.
svm_model = SVC(random_state=42, kernel='linear').fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy}")
# Heading and report go out on separate lines via the newline separator.
print(
    "Classification Report:",
    classification_report(y_test, y_pred_svm, zero_division=0),
    sep="\n",
)

# DECISION TREE CLASSIFIER
# A single seeded decision tree, trained and scored on the existing split.
print("------------------------------------------------")
print("Decision Tree Classifier")
# fit() returns the estimator itself, so construction and training are chained.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
print(f"Decision Tree Accuracy: {dt_accuracy}")
# Heading and report go out on separate lines via the newline separator.
print(
    "Classification Report:",
    classification_report(y_test, y_pred_dt, zero_division=0),
    sep="\n",
)

# LINEAR REGRESSION (Ridge estimator, alpha=1.0)
# Note: `generated` is binary, so this regression fit is exploratory only --
# it is scored by mean squared error on the raw predictions and is not used
# as a primary classification method.
print("------------------------------------------------")
print("Linear Regression")
# fit() returns the estimator itself, so construction and training are chained.
lin_reg_model = Ridge(random_state=42, alpha=1.0).fit(X_train, y_train)
y_pred_lr = lin_reg_model.predict(X_test)
lr_mse = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)
print(f"Linear Regression Mean Squared Error: {lr_mse}")

# Feature importance from the random forest: print and plot the 10 terms with
# the largest `feature_importances_` values. Everything stays under the guard
# because the plot uses names that are only bound inside it.
if hasattr(rf_model, 'feature_importances_'):
    feature_importance = rf_model.feature_importances_
    feature_names = vectorizer.get_feature_names_out()
    # argsort orders ascending, so the last 10 indices are the top 10 features.
    # The array's own method is used because this script never imports an
    # `np` alias, so `np.argsort` would raise NameError.
    sorted_idx = feature_importance.argsort()[-10:]
    print("\nTop 10 Important Features (Random Forest):")
    # Reverse so the most important feature is printed first.
    for idx in sorted_idx[::-1]:
        print(f"{feature_names[idx]}: {feature_importance[idx]:.4f}")

    # Horizontal bar chart of the same 10 features (ascending order, so the
    # most important bar is drawn last).
    plt.figure(figsize=(10, 6))
    plt.barh([feature_names[idx] for idx in sorted_idx], feature_importance[sorted_idx])
    plt.title("Top 10 Features by Importance (Random Forest)")
    plt.xlabel("Importance")
    plt.ylabel("Feature")
    plt.show()