In [2]:
#preprocess step

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# DATA
df = pd.read_csv(r"C:\Users\nh013\Desktop\chatboot Q and A dataset\AI.csv")

# DROP MISSING VALUES
df.dropna(inplace=True)

# REMOVE ANY SPESIAL CHARECTER AND CONVERT TEXT TO LOWER CASE

df['Question'] = df['Question'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

# REMOVE STOP WORDS

stop_words = set(stopwords.words('english'))
df['Question'] = df['Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# LEMMATIZATION

lemmatizer = WordNetLemmatizer()
df['Question'] = df['Question'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

print(df.head())


                                            Question  \
0                 first work generally recognized ai   
1  source drawn formation first work generally re...   
2                      created hebbian learning rule   
3                         first neural network built   
4                        first neural network called   

                                              Answer  
0        Warren McCulloch and Walter Pitts (1943).\n  
1  knowledge of the basic physiology and function...  
2                              Donald Hebb (1949).\n  
3                                            1950.\n  
4                                       The SNARC.\n  


In [5]:
# USING TEXT VECTORIZATION
# TF-IDF (Term Frequency-Inverse Document Frequency)

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# DATA
df = pd.read_csv(r"C:\Users\nh013\Desktop\chatboot Q and A dataset\AI.csv")

# DROP MISSING VALUE
df.dropna(inplace=True)

# REMOVE ANY SPECIAL CHERECTER AND CONVERT LOWERCASE FORM
df['Question'] = df['Question'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

# REMOVE STOPWORDS
stop_words = set(stopwords.words('english'))
df['Question'] = df['Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# LEMMATIZATION
lemmatizer = WordNetLemmatizer()
df['Question'] = df['Question'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# PERFORM TEXT VECTORIZATION
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Question'])

# Convert the vectorized data to a DataFrame
vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# CONCATINATE THE VECTORIZED DATA WITH THE ORGINAL DATA FRAME
df = pd.concat([df, vectorized_df], axis=1)


print(df.head())


                                            Question  \
0                 first work generally recognized ai   
1  source drawn formation first work generally re...   
2                      created hebbian learning rule   
3                         first neural network built   
4                        first neural network called   

                                              Answer  1963  1965  1969  1975  \
0        Warren McCulloch and Walter Pitts (1943).\n   0.0   0.0   0.0   0.0   
1  knowledge of the basic physiology and function...   0.0   0.0   0.0   0.0   
2                              Donald Hebb (1949).\n   0.0   0.0   0.0   0.0   
3                                            1950.\n   0.0   0.0   0.0   0.0   
4                                       The SNARC.\n   0.0   0.0   0.0   0.0   

   1981  1988  1990s  1997  ...  within  word      work  world  worry  would  \
0   0.0   0.0    0.0   0.0  ...     0.0   0.0  0.438322    0.0    0.0    0.0   
1   0.0   0.0    0.0  

In [1]:
#To improve the quality of the input data in terms of handling typos, normalizing abbreviations, and addressing common 
#variations in user input
#update preprocessing..........

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from spellchecker import SpellChecker

#DATA
df = pd.read_csv(r"C:\Users\nh013\Desktop\chatboot Q and A dataset\AI.csv")


# DROP ROWS WITH MISSING VALUES
df.dropna(inplace=True)

# REMOVE ANY SPECIAL CHERECTER AND CONVERT LOWERCASE FORM
df['Question'] = df['Question'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

# REMOVE STOPWORDS
stop_words = set(stopwords.words('english'))
df['Question'] = df['Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# LEMMATIZATION
lemmatizer = WordNetLemmatizer()
df['Question'] = df['Question'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# SPELL-CHECKING
spell = SpellChecker()
df['Question'] = df['Question'].apply(lambda x: ' '.join([spell.correction(word) if spell.correction(word) is not None else word for word in x.split()]))

# ABBREVIATION(Example: 'can't' to 'cannot')
abbreviation_mapping = {
    "can't": "cannot",
    "won't": "will not",
   
}
df['Question'] = df['Question'].apply(lambda x: ' '.join([abbreviation_mapping.get(word, word) for word in x.split()]))

# REMOVE ANY ROWS WITH MISSING OR EMTY  VALUES AFTER PROCESSING
df.dropna(subset=['Question'], inplace=True)


print(df.head())


                                            Question  \
0                 first work generally recognized ai   
1  source drawn formation first work generally re...   
2                      created lesbian learning rule   
3                         first neural network built   
4                        first neural network called   

                                              Answer  
0        Warren McCulloch and Walter Pitts (1943).\n  
1  knowledge of the basic physiology and function...  
2                              Donald Hebb (1949).\n  
3                                            1950.\n  
4                                       The SNARC.\n  


In [2]:
# UPDATE TEXT VECTORIZATION......

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import TfidfVectorizer
from spellchecker import SpellChecker

# DATA
df = pd.read_csv(r"C:\Users\nh013\Desktop\chatboot Q and A dataset\AI.csv")

# DROP ROWS WITH MISSING VALUES
df.dropna(inplace=True)

# REMOVE ANY SPECIAL CHERECTER AND CONVERT LOWERCASE FORM
df['Question'] = df['Question'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

# REMOVE STOPWORDS
stop_words = set(stopwords.words('english'))
df['Question'] = df['Question'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# LEMMATIZATIONS
lemmatizer = WordNetLemmatizer()
df['Question'] = df['Question'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

# SPELL CHECKING
spell = SpellChecker()
df['Question'] = df['Question'].apply(lambda x: ' '.join([spell.correction(word) if spell.correction(word) is not None else word for word in x.split()]))

# ABBREVIATION(Example: 'can't' to 'cannot')
abbreviation_mapping = {
    "can't": "cannot",
    "won't": "will not",
   
}
df['Question'] = df['Question'].apply(lambda x: ' '.join([abbreviation_mapping.get(word, word) for word in x.split()]))

# REMOVE ANY ROWS WITH MISSING OR EMTY  VALUES AFTER PROCESSING
df.dropna(subset=['Question'], inplace=True)

# PERFORM TEXT VECTORIZATION
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Question'])

# Convert the vectorized data to a DataFrame
vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# CONCATINATE THE VECTORIZED DATA WITH THE ORGINAL DATA FRAME
df = pd.concat([df, vectorized_df], axis=1)


print(df.head())

                                            Question  \
0                 first work generally recognized ai   
1  source drawn formation first work generally re...   
2                      created lesbian learning rule   
3                         first neural network built   
4                        first neural network called   

                                              Answer  1963  1965  1969  1975  \
0        Warren McCulloch and Walter Pitts (1943).\n   0.0   0.0   0.0   0.0   
1  knowledge of the basic physiology and function...   0.0   0.0   0.0   0.0   
2                              Donald Hebb (1949).\n   0.0   0.0   0.0   0.0   
3                                            1950.\n   0.0   0.0   0.0   0.0   
4                                       The SNARC.\n   0.0   0.0   0.0   0.0   

   1981  1988  1990s  1997  ...  winter  within  word      work  world  worry  \
0   0.0   0.0    0.0   0.0  ...     0.0     0.0   0.0  0.438322    0.0    0.0   
1   0.0   0.0    0.0