In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Step 2.1: Text Cleaning
def clean_text(text):
    """Return *text* with ASCII punctuation removed and letters lowercased."""
    # One translation table deletes every character in string.punctuation.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_table).lower()

# Step 2.2: Tokenization
def tokenize_text(text):
    """Split *text* into a list of tokens using NLTK's word_tokenize."""
    return word_tokenize(text)

# Step 2.3: Stopword Removal
def remove_stopwords(tokens):
    """Return the tokens that are not in NLTK's English stopword list.

    The comparison is exact (case-sensitive), and the relative order of the
    surviving tokens is preserved.
    """
    english_stopwords = set(stopwords.words('english'))
    return [word for word in tokens if word not in english_stopwords]

In [None]:
# Single hand-written sample used to demonstrate the preprocessing helpers.
text_data = ["Hi anshika, I hate you. I cannot tolerate you."]

In [None]:
# Download the NLTK 'punkt' resource; word_tokenize presumably needs it
# (NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead - verify).
nltk.download('punkt')

In [None]:
# Download the NLTK 'stopwords' corpus that remove_stopwords reads through
# stopwords.words('english').
nltk.download('stopwords')

In [None]:
# Run every sample through cleaning, then tokenize each cleaned string.
cleaned_text = list(map(clean_text, text_data))
tokenized_text = list(map(tokenize_text, cleaned_text))

In [None]:
# Drop English stopwords from each token list.
filtered_text = list(map(remove_stopwords, tokenized_text))

In [None]:
# Show the sample after each preprocessing stage so the effect of every step
# can be compared side by side.
print("Original Text:", text_data)
print("Cleaned Text:", cleaned_text)
print("Tokenized Text:", tokenized_text)
print("Text after Stopword Removal:", filtered_text)

In [None]:
import os

import mysql.connector

# Open the MySQL connection used by the following cells.
# Connection settings are read from the environment when present so real
# credentials do not have to live in the notebook; the fallbacks are the
# values this cell originally hard-coded.
try:
    db_connection = mysql.connector.connect(
        host=os.environ.get("AI_HATE_DB_HOST", "localhost"),
        user=os.environ.get("AI_HATE_DB_USER", "root"),
        password=os.environ.get("AI_HATE_DB_PASSWORD", "root"),
        database=os.environ.get("AI_HATE_DB_NAME", "AI_hate"),
    )
except mysql.connector.Error as e:
    print("Error connecting to the database:", e)
    # Raise SystemExit directly instead of calling the interactive-only
    # exit() helper, and use a non-zero status because this is a failure.
    raise SystemExit(1)
else:
    print("Connected to the database successfully!")

In [None]:
# Open a cursor on the connection; the next cells run their queries on it.
cursor = db_connection.cursor()

In [None]:
# Load every row of ChatScreenshots into memory.
# NOTE(review): later cells read row[1] as the text and row[2] as the label,
# which depends on the table's column order - confirm against the schema.
cursor.execute("SELECT * FROM ChatScreenshots")
rows = cursor.fetchall()

In [None]:
import random

# Draw up to 25 rows at random. random.sample raises ValueError when asked
# for more items than the population holds, so cap the size at len(rows).
random_rows = random.sample(rows, min(25, len(rows)))
preprocessed_texts = []
hate_speech_labels = []
# Tokenize the text of each sampled row and collect text/label pairs.
for row in random_rows:
    # NOTE: this rebinds the notebook-level names text_data and
    # tokenized_text that earlier cells created.
    text_data = row[1]
    tokenized_text = word_tokenize(text_data)
    # TfidfVectorizer expects strings, so join the tokens back together.
    preprocessed_text = " ".join(tokenized_text)
    preprocessed_texts.append(preprocessed_text)
    # The label is read from index 2, i.e. the third column of the row.
    hate_speech_labels.append(row[2])
    print("Original Text:", text_data)
    print("Tokenized Text:", tokenized_text)
    print()


In [None]:
# Sanity check: the sampling loop appends to both lists once per row, so the
# lengths are expected to agree; stop if they ever diverge.
if len(preprocessed_texts) != len(hate_speech_labels):
    print("Inconsistent number of preprocessed texts and labels.")
    exit()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the sampled texts (and their labels) for evaluation;
# random_state=42 pins the seed passed to the splitter.
X_train, X_test, y_train, y_test = train_test_split(preprocessed_texts, hate_speech_labels, test_size=0.2, random_state=42)

In [None]:
# Build the TF-IDF vectorizer that turns raw strings into feature vectors.
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training texts only, and vectorize them.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Vectorize the held-out texts with the already-fitted vectorizer.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Logistic regression with default settings is the classifier.
classifier = LogisticRegression()

# Fit the classifier on the training vectors and labels.
classifier.fit(X_train_tfidf, y_train)

# Predict labels for the held-out vectors.
y_pred = classifier.predict(X_test_tfidf)

# Report the fraction of held-out samples predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
pip install pytesseract

In [None]:
pip install Pillow

In [None]:
from PIL import Image
from textblob import TextBlob
import pytesseract
import re
import mysql.connector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Connect to the MySQL database
import os

# Connection settings are read from the environment when present so real
# credentials do not have to live in the notebook; the fallbacks are the
# values this cell originally hard-coded.
try:
    db_connection = mysql.connector.connect(
        host=os.environ.get("AI_HATE_DB_HOST", "localhost"),
        user=os.environ.get("AI_HATE_DB_USER", "root"),
        password=os.environ.get("AI_HATE_DB_PASSWORD", "root"),
        database=os.environ.get("AI_HATE_DB_NAME", "AI_hate"),
    )
except mysql.connector.Error as e:
    print("Error connecting to the database:", e)
    # Raise SystemExit directly instead of calling the interactive-only
    # exit() helper, and use a non-zero status because this is a failure.
    raise SystemExit(1)
else:
    print("Connected to the database successfully!")

# Step 2.1: Text Cleaning
def clean_text(text):
    """Return *text* with ASCII punctuation removed and letters lowercased."""
    # One translation table deletes every character in string.punctuation.
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = text.translate(punctuation_table)
    return without_punctuation.lower()

# Step 2.2: Tokenization
def tokenize_text(text):
    """Split *text* into a list of tokens using NLTK's word_tokenize."""
    return word_tokenize(text)

# Step 2.3: Stopword Removal
def remove_stopwords(tokens):
    """Return the tokens that are not in NLTK's English stopword list.

    The comparison is exact (case-sensitive), and the relative order of the
    surviving tokens is preserved.
    """
    english_stopwords = set(stopwords.words('english'))
    return [word for word in tokens if word not in english_stopwords]

def train_model(X_train, y_train):
    """Fit a TF-IDF vectorizer and a logistic-regression classifier.

    Args:
        X_train: Training texts handed to ``TfidfVectorizer.fit_transform``.
        y_train: Labels handed to ``LogisticRegression.fit``.

    Returns:
        A ``(classifier, vectorizer)`` tuple, both fitted on the given data.
    """
    vectorizer = TfidfVectorizer()
    features = vectorizer.fit_transform(X_train)

    model = LogisticRegression()
    model.fit(features, y_train)

    return model, vectorizer

def analyze_sentiment(text):
    """Classify *text* as "Positive", "Negative" or "Neutral".

    The score starts from TextBlob's polarity and is lowered by 0.1 when at
    least one whitespace-separated word is entirely upper-case. The sign of
    the final score selects the label; exactly zero is "Neutral".
    """
    polarity = TextBlob(text).sentiment.polarity

    # A single all-caps word is enough to apply the penalty (applied once).
    if any(map(str.isupper, text.split())):
        polarity -= 0.1

    if polarity > 0:
        return "Positive"
    if polarity < 0:
        return "Negative"
    return "Neutral"

def detect_hate_speech(text, classifier, hate_speech_phrases, vectorizer=None,
                       positive_labels=(1, "hate speech")):
    """Decide whether *text* contains hate speech.

    The text is first scored by the trained classifier; if that does not flag
    it, the raw text is searched for any of the known hate-speech phrases.

    Args:
        text: Raw input text (for example OCR output).
        classifier: Fitted model exposing ``predict``.
        hate_speech_phrases: Iterable of literal phrases to look for.
        vectorizer: Fitted vectorizer matching *classifier*. When omitted, the
            module-level ``tfidf_vectorizer`` is used, as the original code did.
        positive_labels: Predicted values that count as hate speech. ``1`` is
            the value the original code checked for. ``"hate speech"`` is
            included because the phrase query in this file filters on
            ``label = 'hate speech'``, which suggests string labels -
            NOTE(review): confirm against the table schema.

    Returns:
        True if the classifier or the phrase lookup flags the text, else False.
    """
    if vectorizer is None:
        # Backward compatible with callers that rely on the global.
        vectorizer = tfidf_vectorizer

    # Preprocess the input with the cleaning / tokenizing / stopword helpers
    # before it is vectorized.
    cleaned_text = clean_text(text)
    tokenized_text = tokenize_text(cleaned_text)
    filtered_text = remove_stopwords(tokenized_text)

    # The vectorizer works on strings, so join the tokens back together.
    text_tfidf = vectorizer.transform([' '.join(filtered_text)])

    prediction = classifier.predict(text_tfidf)
    if prediction[0] in positive_labels:
        return True

    # Fall back to a case-insensitive literal search of the raw text.
    for phrase in hate_speech_phrases:
        if not phrase:
            # An empty or NULL phrase would match every text; ignore it.
            continue
        # Phrases are plain text, not patterns: escape regex metacharacters
        # so characters such as "?" or "(" cannot raise or mis-match.
        if re.search(re.escape(phrase), text, flags=re.IGNORECASE):
            return True

    return False

def ocr_and_detect(image_path, classifier, hate_speech_phrases):
    """Run OCR on an image and report whether its text is hate speech.

    Prints the detection result and, when hate speech is found, the sentiment
    label of the extracted text.

    Args:
        image_path: Path of the image file to read.
        classifier: Fitted model forwarded to ``detect_hate_speech``.
        hate_speech_phrases: Known phrases forwarded to ``detect_hate_speech``.

    Returns:
        The text extracted from the image by Tesseract.
    """
    # Use the image as a context manager so its file handle is released as
    # soon as OCR finishes.
    with Image.open(image_path) as image:
        detected_text = pytesseract.image_to_string(image)

    if detect_hate_speech(detected_text, classifier, hate_speech_phrases):
        print("Hate speech detected in the image!")
        sentiment = analyze_sentiment(detected_text)
        print("Sentiment of the detected text:", sentiment)
    else:
        print("No hate speech detected in the image.")

    return detected_text

# Load the labelled rows and the known hate-speech phrases, then release the
# cursor: nothing after this point queries the database through it.
cursor = db_connection.cursor()
try:
    cursor.execute("SELECT * FROM ChatScreenshots")
    rows = cursor.fetchall()

    cursor.execute("SELECT content FROM ChatScreenshots WHERE label = 'hate speech'")
    hate_speech_phrases = [row[0] for row in cursor.fetchall()]
finally:
    cursor.close()

# Fail with a clear message instead of an obscure vectorizer error when there
# is nothing to train on.
if not rows:
    raise SystemExit("ChatScreenshots returned no rows; cannot train the model.")

# Train the model on the table contents: index 1 is used as the text and
# index 2 as the label of each row.
texts = [row[1] for row in rows]
labels = [row[2] for row in rows]
classifier, tfidf_vectorizer = train_model(texts, labels)

# Ask for the image path; surrounding double quotes are stripped.
image_path = input("Enter the path to the image file: ").strip('"')

# Perform OCR and hate speech detection, then show the extracted text.
st = ocr_and_detect(image_path, classifier, hate_speech_phrases)
print(st)