In [37]:
import pandas as pd
df = pd.read_csv("train_snli.csv")

In [38]:
# Manually inspect the first few raw lines of the dataset
with open('train_snli.csv', 'r', encoding='utf-8') as file:
    raw_lines = [next(file) for _ in range(367372
                                           )]

# Display the raw lines
raw_lines


['"A person on a horse jumps over a broken down airplane.\tA person is at a diner, ordering an omelette.\t0"\n',
 '"A person on a horse jumps over a broken down airplane.\tA person is outdoors, on a horse.\t1"\n',
 '"Children smiling and waving at camera\tThere are children present\t1"\n',
 '"Children smiling and waving at camera\tThe kids are frowning\t0"\n',
 '"A boy is jumping on skateboard in the middle of a red bridge.\tThe boy skates down the sidewalk.\t0"\n',
 '"A boy is jumping on skateboard in the middle of a red bridge.\tThe boy does a skateboarding trick.\t1"\n',
 '"An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background.\tA boy flips a burger.\t0"\n',
 '"Two blond women are hugging one another.\tThe women are sleeping.\t0"\n',
 '"Two blond women are hugging one another.\tThere are women showing affection.\t1"\n',
 '"A few people in a restaurant setting, one of them is drinking orange juice.\t

In [39]:
# Manually split the single column into multiple columns using tab ('\t') separator
df_fixed = df.iloc[:, 0].str.split("\t", expand=True)

# Assign correct column names based on expected dataset structure
df_fixed.columns = ["Premise", "Hypothesis", "Label"]

# Display the first few rows to confirm the fix
df_fixed.head()

Unnamed: 0,Premise,Hypothesis,Label
0,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
1,Children smiling and waving at camera,There are children present,1
2,Children smiling and waving at camera,The kids are frowning,0
3,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0
4,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,1


In [40]:
# Clean the data by removing extra quotation marks and stripping spaces
cleaned_lines = [line.replace('"', '').strip() for line in raw_lines]

# Split each line into columns using tab as the separator
processed_data = [line.split("\t") for line in cleaned_lines]

# Convert to a DataFrame with correct column names
df_cleaned = pd.DataFrame(processed_data, columns=["Premise", "Hypothesis", "Label"])

# Display the first few rows to confirm proper formatting
df_cleaned.head(10)


Unnamed: 0,Premise,Hypothesis,Label
0,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0
1,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
2,Children smiling and waving at camera,There are children present,1
3,Children smiling and waving at camera,The kids are frowning,0
4,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0
5,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,1
6,An older man sits with his orange juice at a s...,A boy flips a burger.,0
7,Two blond women are hugging one another.,The women are sleeping.,0
8,Two blond women are hugging one another.,There are women showing affection.,1
9,"A few people in a restaurant setting, one of t...",The people are sitting at desks in school.,0


In [None]:
# Convert the 'Label' column to integers
df_cleaned["Label"] = df_cleaned["Label"].astype(int)

# Define plagiarism detection labels
def label_to_plagiarism(label):
    return "Plagiarism Detected" if label in [0, 2] else "No Plagiarism Detected"

# Apply the function to create a new column
df_cleaned["Plagiarism_Status"] = df_cleaned["Label"].apply(label_to_plagiarism)

# Display the updated dataset
df_cleaned.head(10)


Unnamed: 0,Premise,Hypothesis,Label,Plagiarism_Status
0,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0,Plagiarism Detected
1,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1,No Plagiarism Detected
2,Children smiling and waving at camera,There are children present,1,No Plagiarism Detected
3,Children smiling and waving at camera,The kids are frowning,0,Plagiarism Detected
4,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0,Plagiarism Detected
5,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,1,No Plagiarism Detected
6,An older man sits with his orange juice at a s...,A boy flips a burger.,0,Plagiarism Detected
7,Two blond women are hugging one another.,The women are sleeping.,0,Plagiarism Detected
8,Two blond women are hugging one another.,There are women showing affection.,1,No Plagiarism Detected
9,"A few people in a restaurant setting, one of t...",The people are sitting at desks in school.,0,Plagiarism Detected


In [None]:
# Import necessary libraries for text processing and model training
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned[["Premise", "Hypothesis"]], df_cleaned["Plagiarism_Status"],
    test_size=0.2, random_state=42)

# Combine "Premise" and "Hypothesis" for similarity-based learning
X_train_combined = X_train["Premise"] + " " + X_train["Hypothesis"]
X_test_combined = X_test["Premise"] + " " + X_test["Hypothesis"]

# Build a text classification pipeline
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),  # Convert text to numerical vectors
    ("classifier", LogisticRegression())  # Train logistic regression model
])

# Train the model
pipeline.fit(X_train_combined, y_train)

# Make predictions
y_pred = pipeline.predict(X_test_combined)

# Evaluate model accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print accuracy
print(f"Model Accuracy: {accuracy:.2f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Accuracy: 0.70


In [None]:
print(f"Final Logistic Regression Accuracy: {accuracy:.2f}")


Final Logistic Regression Accuracy: 0.70


In [None]:
import joblib

# Save the model
joblib.dump(pipeline, "plagiarism_detector.pkl")

print("✅ Model saved as plagiarism_detector.pkl")


✅ Model saved as plagiarism_detector.pkl


In [41]:
# Load the saved model
model = joblib.load("plagiarism_detector.pkl")

# Example test case
test_premise = "A person is riding a horse."
test_hypothesis = "An individual is on an animal."

# Combine and predict
test_input = [test_premise + " " + test_hypothesis]
prediction = model.predict(test_input)[0]

print(f"🔍 Model Prediction: {'Plagiarism Detected' if prediction == 1 else 'No Plagiarism Detected'}")


🔍 Model Prediction: No Plagiarism Detected
