In [2]:


import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


def load_jsonl_data(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Every non-blank line of the file is parsed as one JSON document and
    becomes one row of the result. Blank or whitespace-only lines (for
    example a trailing empty line at the end of the file) are skipped
    instead of being handed to the JSON parser.

    Args:
        file_path: Path to a UTF-8 encoded JSON Lines file.

    Returns:
        pd.DataFrame: One row per parsed line; empty if the file has no
        non-blank lines.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Stream the file line by line; `line.strip()` is falsy for lines
        # that contain nothing but whitespace, which json.loads would reject.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)

# Load the train/dev splits. Forward slashes are used as the path separator
# because they work on every OS; a backslash separator is Windows-only and
# '\e' is not a valid string escape sequence.
train_data = load_jsonl_data('ds/en_train.jsonl')
dev_data = load_jsonl_data('ds/en_dev.jsonl')

# Separate the raw text (features) from the labels (targets) for each split.
X_train = train_data['text']
y_train = train_data['label']
X_dev = dev_data['text']
y_dev = dev_data['label']

# Fit the TF-IDF vocabulary on the training text only, then reuse that same
# fitted vectorizer for the dev text so both splits share one feature space.
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_tfidf = vectorizer.fit_transform(X_train)
X_dev_tfidf = vectorizer.transform(X_dev)

# Train a logistic-regression classifier with scikit-learn's default settings.
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)

# Predict on the dev split and score the predictions against the dev labels.
y_pred = classifier.predict(X_dev_tfidf)

accuracy = accuracy_score(y_dev, y_pred)
report = classification_report(y_dev, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.7938515728268095
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.68      0.71     98328
           1       0.82      0.86      0.84    163430

    accuracy                           0.79    261758
   macro avg       0.78      0.77      0.78    261758
weighted avg       0.79      0.79      0.79    261758



In [3]:
import json


# Run the trained classifier over the vectorized dev split.
y_dev_pred = classifier.predict(X_dev_tfidf)

# Build one {"id", "label"} record per dev example, pairing each id with its
# prediction; int() coerces the predicted label to a built-in int before it
# is serialised.
predictions = [
    {"id": idx, "label": int(label)}
    for idx, label in zip(dev_data['id'], y_dev_pred)
]

# Write the records as JSON Lines: one JSON object per line.
output_file = 'predictions.jsonl'
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(entry) + '\n' for entry in predictions)

print(f"Predictions saved to {output_file}")
# Label convention (author's note): 1 = machine-generated, 0 = human-generated.

Predictions saved to predictions.jsonl
