In [None]:
%conda install -U pandas numpy pyarrow tensorflow tensorflow_hub nltk matplotlib seaborn scikit-learn imbalanced-learn;

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('dataset.csv', usecols = ['par_id', 'paragraph', 'has_entity', 'lexicon_count', 'difficult_words', 'last_editor_gender', 'category'])

In [None]:
df = df.dropna()

In [None]:
# Convert the 'category' column to lowercase
df['category'] = df['category'].str.lower()

In [19]:
df = df[df['has_entity'] != 'data missing']

In [None]:
missing_values = df.isnull().sum()
print(missing_values)

In [None]:
category_counts = df['category'].unique()
print(category_counts)

In [None]:
# Count the occurrences of each category
category_counts = df['category'].value_counts()

# Create a pie chart
plt.figure(figsize=(8, 8))
plt.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
plt.title('Distribution of Categories')
plt.show()

In [20]:
entity_count = df['has_entity'].unique()
print(entity_count)

['ORG_YES_PRODUCT_NO_PERSON_YES_' 'ORG_YES_PRODUCT_NO_PERSON_NO_'
 'ORG_NO_PRODUCT_YES_PERSON_NO_' 'ORG_YES_PRODUCT_YES_PERSON_YES_'
 'ORG_NO_PRODUCT_NO_PERSON_NO_' 'ORG_NO_PRODUCT_YES_PERSON_YES_'
 'ORG_NO_PRODUCT_NO_PERSON_YES_' 'ORG_YES_PRODUCT_YES_PERSON_NO_']


In [None]:
category_values = df.value_counts('category')
print(category_values)

In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Define text preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(lemmatized_tokens)
    
    return preprocessed_text

# Load data
df = pd.read_csv('dataset.csv')

# Preprocess text
df['paragraph'] = df['paragraph'].apply(preprocess_text)

# Define feature extraction and model training function
def train_model():
    # Feature extraction
    vectorizer = TfidfVectorizer()
    X_text = vectorizer.fit_transform(df['paragraph'])

    encoder = OneHotEncoder()
    X_entity = encoder.fit_transform(df[['has_entity']])

    X = np.hstack((X_text.toarray(), X_entity.toarray()))

    y = df['category']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    # Train the model
    model = MultiOutputClassifier(SVC())
    model.fit(X_train, y_train)

    return model

# Train the model
model = train_model()

# Evaluate the model
X_test = np.hstack((vectorizer.transform(df['paragraph']).toarray(), encoder.transform(df[['has_entity']]).toarray()))
y_pred = model.predict(X_test)
accuracy = accuracy_score(df['category'], y_pred)
print("Accuracy:", accuracy)
print(classification_report(df['category'], y_pred))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\muhammedazhar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\muhammedazhar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\muhammedazhar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


ValueError: Input contains NaN