In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd

# Load the training and test data.
# NOTE(review): these are absolute, machine-specific paths; adjust them when
# running anywhere else.
train_df = pd.read_csv(r"C:\Users\naimu\Downloads\Nitro NLP\train.csv\train.csv")
test_df = pd.read_csv(r"C:\Users\naimu\Downloads\Nitro NLP\test.csv\test.csv")

# Display column names
print("Column names:", train_df.columns.tolist())

# Ensure the presence of the target variable column
if 'class' not in train_df.columns:
    raise ValueError("Target variable column 'class' not found in the dataset.")

# Fail fast on every other column the rest of the script indexes. Without this
# check a missing 'id' in the test file would only surface as a KeyError when
# the submissions are built, i.e. after both models have been trained.
for _label, _frame, _needed in (
    ("training", train_df, ("content",)),
    ("test", test_df, ("id", "content")),
):
    _absent = [col for col in _needed if col not in _frame.columns]
    if _absent:
        raise ValueError(
            f"Required column(s) {_absent} not found in the {_label} dataset."
        )

# Display the first rows of the training data (file order)
print("\nHeader of the dataset:")
print(train_df.head())

# Display a random sample of rows (unseeded, so it differs between runs)
print("\nSample of the dataset:")
print(train_df.sample(5))  # Adjust the number of rows as needed

# Replace missing article bodies with empty strings once and feed the same
# Series to both feature pipelines.
train_texts = train_df['content'].fillna('')

# Preprocessing for Logistic Regression: TF-IDF vectorization.
# The rows printed by this cell are Romanian, so scikit-learn's built-in
# 'english' stop list does not match the corpus: it cannot remove Romanian
# function words and may drop Romanian tokens that happen to be spelled like
# English ones. No stop list is used; TF-IDF's IDF weighting already
# down-weights terms that occur in most documents.
vectorizer = TfidfVectorizer(stop_words=None)
X_lr = vectorizer.fit_transform(train_texts)
y_lr = train_df['class']

# Preprocessing for Feedforward Neural Network: Tokenization, sequence padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
X_nn = tokenizer.texts_to_sequences(train_texts)
# No maxlen is given, so every row is padded to the length of the longest
# training sequence; X_nn.shape[1] is that length.
X_nn = pad_sequences(X_nn)

# Hold out 20% of the rows for validation. The TF-IDF matrix, the padded
# sequences and the labels are row-aligned, so splitting them in one seeded
# call selects the same rows for both models.
(
    X_train_lr, X_val_lr,
    X_train_nn, X_val_nn,
    y_train_lr, y_val_lr,
) = train_test_split(X_lr, X_nn, y_lr, test_size=0.2, random_state=42)

# Both models are trained against the same label split.
y_train_nn, y_val_nn = y_train_lr, y_val_lr

# Train a Logistic Regression model on the TF-IDF features.
# max_iter is raised to 1000 to give the solver more iterations to converge.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_lr, y_train_lr)

# Score the model on the hold-out split. The split was created for this
# purpose but was previously never evaluated, so the baseline's quality could
# not be compared with the neural network's reported validation accuracy.
y_val_pred_lr = lr_model.predict(X_val_lr)
print("\nLogistic Regression validation accuracy:",
      accuracy_score(y_val_lr, y_val_pred_lr))
print(classification_report(y_val_lr, y_val_pred_lr))

# Build and train the feedforward network: token embeddings are flattened into
# one vector per document, passed through a 64-unit ReLU layer, and reduced to
# a single sigmoid output for the binary label.
# The embedding table gets one row more than the tokenizer's vocabulary size
# (presumably for the 0 value used as padding - confirm against the Keras docs).
vocab_size = len(tokenizer.word_index) + 1
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(
    X_train_nn,
    y_train_nn,
    validation_data=(X_val_nn, y_val_nn),
    batch_size=32,
    epochs=5,
)

# Missing test article bodies are treated as empty strings, as in training.
test_texts = test_df['content'].fillna('')

# Logistic Regression: reuse the vectorizer fitted on the training text, then
# predict class labels.
X_test_lr = vectorizer.transform(test_texts)
y_pred_lr = lr_model.predict(X_test_lr)

# Feedforward network: encode with the fitted tokenizer and pad to the width
# of the training matrix.
X_test_nn = pad_sequences(
    tokenizer.texts_to_sequences(test_texts),
    maxlen=X_nn.shape[1],
)
# Threshold the sigmoid outputs at 0.5 and store the decisions as integers.
nn_scores = model.predict(X_test_nn)
y_pred_nn = (nn_scores > 0.5).astype(int)

# Build one ('id', 'class') frame per model, both keyed by the test ids.
# NOTE(review): the logistic-regression column holds whatever label type the
# classifier was trained on (the printed training sample shows True/False),
# while the network's column was cast to int - confirm which encoding the
# consumer of these files expects.
test_ids = test_df['id']
submission_lr = pd.DataFrame({'id': test_ids, 'class': y_pred_lr})
# The network's predictions are collapsed to one dimension so they fit a
# single column.
submission_nn = pd.DataFrame({'id': test_ids, 'class': y_pred_nn.reshape(-1)})

# Write each frame to its CSV file without the index column.
for frame, target in (
    (submission_lr, r'C:\Users\naimu\Downloads\Nitro NLP\logistic_regression_submission.csv'),
    (submission_nn, r'C:\Users\naimu\Downloads\Nitro NLP\feedforward_nn_submission.csv'),
):
    frame.to_csv(target, index=False)


Column names: ['id', 'title', 'content', 'class']

Header of the dataset:
   id                                              title  \
0   0                                      PSD în alertă   
1   1  În amintirea Vioricăi, milioane de români beau...   
2   2  Dramă! Când credea că nu se poate mai rău, un ...   
3   3  Spania - România, 5-0. „Tricolorii”, îngenunch...   
4   4              Campanie electorală, veselie generală   

                                             content  class  
0  Prăbușirea PSD de la altitudinea sigură a celo...   True  
1  Moțiunea de cenzură care a doborât guvernul Dă...   True  
2  Credeai că ai ajuns la fundul sacului? Înseamn...   True  
3  Echipa națională a României a fost umilită, lu...  False  
4  Toate cresc în campania electorală, cît n-au c...   True  

Sample of the dataset:
          id                                              title  \
25990  25990  Covid-ul a făcut astăzi 4 milioane de victime:...   
41326  41326  BNR menține rata dobâ