In [3]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Load the "google/civil_comments" dataset through the `datasets` library and
# convert its 'train' split to a pandas DataFrame for the cells below.
dataset = load_dataset(path="google/civil_comments")
df = dataset["train"].to_pandas()

In [5]:
# Display the loaded frame as the cell output: a `text` column plus the seven score columns.
df

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,"This is so cool. It's like, 'would you want yo...",0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
1,Thank you!! This would make my life a lot less...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
2,This is such an urgent design problem; kudos t...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
3,Is this something I'll be able to install on m...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
4,haha you guys are a bunch of losers.,0.893617,0.021277,0.000000,0.0,0.872340,0.021277,0.0
...,...,...,...,...,...,...,...,...
1804869,"Maybe the tax on ""things"" would be collected w...",0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
1804870,What do you call people who STILL think the di...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
1804871,"thank you ,,,right or wrong,,, i am following ...",0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
1804872,Anyone who is quoted as having the following e...,0.621212,0.030303,0.030303,0.0,0.621212,0.045455,0.0


In [8]:
# Regression targets: the seven per-comment score columns of the dataset.
labels = [
    'toxicity',
    'severe_toxicity',
    'obscene',
    'threat',
    'insult',
    'identity_attack',
    'sexual_explicit',
]
# X holds the raw comment text; y holds one column per entry in `labels`.
X, y = df['text'], df[labels]

In [10]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Turn the comment text into TF-IDF features, capped at 5000 vocabulary terms.
# The vectorizer is fitted on the training text only; the test text is then
# transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [13]:
# Fit a linear regression on the TF-IDF training features. y_train carries one
# column per entry in `labels`, so this single model predicts all label scores.
model = LinearRegression()
model.fit(X_train_tfidf, y_train)

In [15]:
# Predict on the held-out split and report error / fit quality computed over
# all label columns together.
y_pred = model.predict(X_test_tfidf)

test_mse = mean_squared_error(y_test, y_pred)
print(f"Mean_squared_error:{test_mse}")

test_r2 = r2_score(y_test, y_pred)
print(f"R^2 score: {test_r2}")

Mean_squared_error:0.006664667643040905
R^2 score: 0.32224628254490634


In [16]:
def get_comment_rating(comment):
    """Predict the label scores for a single comment.

    The comment is wrapped in a one-element list, vectorized with the
    notebook-level ``vectorizer`` and passed to the notebook-level ``model``.

    Args:
        comment: The comment text to score.

    Returns:
        The first (and only) row of the model's prediction for the comment.
    """
    features = vectorizer.transform([comment])
    return model.predict(features)[0]

In [None]:
# Print the label names for reference when reading the score vectors below;
# presumably predictions follow this column order (y = df[labels]) — confirm.
print(labels)

['toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']


In [22]:
# Try the model on a benign example comment and print the predicted scores.
new_comment = "This is very nice"
print(new_comment)
rating = get_comment_rating(new_comment)
print(f"Toxicity score:{rating}")

This is very nice
Toxicity score:[0.04471473 0.00197949 0.0068019  0.0032513  0.03603179 0.00393528
 0.00238995]


In [23]:
# Try the model on a negatively worded example comment and print the predicted scores.
new_comment = "This is terrible"
print(new_comment)
rating = get_comment_rating(new_comment)
print(f"Toxicity score:{rating}")

This is terrible
Toxicity score:[0.19805223 0.00650995 0.01932027 0.00403869 0.17463148 0.02664472
 0.00097068]
