## Step 1 : Text Cleaning and Preprocessing
#### The below lines of code are used for text cleaning in NLP.
- It will convert the text into lower case.
- And it will deleted all the punctuation marks in the text.
- It will also delete all the numbers and white spaces.
- It will output simple text in lower case.

In [1]:
import re
import string
from nltk.corpus import stopwords

def clean_text(text):
    #convert to lower case
    text = text.lower()
    #remove punctuation
    text = text.translate(str.maketrans('','', string.punctuation)) # two single quotes 
    #Remove numbers
    text = re.sub(r'\d+','', text)
    #Remove extra whitespace
    text = ' '.join(text.split())
    return text
clean_text("Machine Learning is Fun! <3 ")

'machine learning is fun'

## Step 2 : Tokenization and Stop Word Removal
#### The below code will tokenize and remove the stop words from the text.

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
def tokenize_text(text):
    #Tokenize
    tokens = word_tokenize(text)
    #Remove Stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens
tokenize_text("The quick brown fox jumps over the lazy dog")

['The', 'quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']

In [3]:
tokenize_text(clean_text(("Machine learning is fun!")))

['machine', 'learning', 'fun']

## Step 3 : Feature Extraction - TF-IDF

In [None]:
!pip install --upgrade scikit-learn

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

documents = [
    "I love programming in Python.",
    "Python is great for data science.",
    "I enjoy learning new Python libraries.",
    "Machine learning is fun!"
]


#create TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features = 1000, ngram_range=(1,2), min_df = 2, max_df = 0.8)
#Fit and Transform
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

In [11]:
tfidf_matrix

<4x3 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [12]:
feature_names

array(['is', 'learning', 'python'], dtype=object)

In [13]:
# Convert to pandas DataFrame
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Show the table
df_tfidf

Unnamed: 0,is,learning,python
0,0.0,0.0,1.0
1,0.777221,0.0,0.629228
2,0.0,0.777221,0.629228
3,0.707107,0.707107,0.0


## Step 4 Advanced Text Features

In [17]:
def extract_text_features(text):
    features = {}
    # Basic statistics 
    features['char_count'] = len(text)
    features['word_count'] = len(text.split())
    features['sentence_count'] = text.count('.') + text.count('!') + text.count('?')
    #Advanced Features
    features['avg_word_length'] = sum(len(word) for word in text.split()) / len(text.split())
    features['unique_words'] = len(set(text.split()))
    features['lexical_diversity'] = features['unique_words'] / features['word_count']
    return features

In [18]:
extract_text_features("Machine learning is fun.")

{'char_count': 24,
 'word_count': 4,
 'sentence_count': 1,
 'avg_word_length': 5.25,
 'unique_words': 4,
 'lexical_diversity': 1.0}

## Both the two cells are have same code but a little different syntax

In [9]:
#tokenization and removing the stop words
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
def Tokenize_Text(text):
    #tokenizing the data
    tokens = word_tokenize(text)
    #setting the stop words to english stop words
    stop_words = set(stopwords.words('english'))
    new_tokens =[]
    for token in tokens:
        if token not in stop_words:
            new_tokens.append(token)
    return new_tokens
Tokenize_Text("My name is Maham.")

['My', 'name', 'Maham', '.']

In [5]:
#tokenization and removing the stop words
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
def Tokenize_Text(text):
    #tokenizing the data
    tokens = word_tokenize(text)
    #setting the stop words to english stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return tokens
Tokenize_Text("My name is Maham.")

['My', 'name', 'Maham', '.']

In [10]:
import pandas as pd
def extract_features(text):
    features ={}
    words = text.split()
    features['char_count'] = len(text)
    features['word_count'] = len(words)
    features['sentence_count'] =text.count('.') + text.count('!') + text.count('?')
    features['exclamation_count'] = text.count('!')
    features['question_mark_count'] = text.count('?')
    features['hashtag_count'] = text.count('#')
    features['mentions_count'] = text.count('@')
    features['avg_word_length'] = sum(len(word) for word in words) / len(words) if len(words) > 0 else 0
    return features
feature = extract_features("My name is Maham.")
df = pd.DataFrame(feature, index =[1])
df
#data['features']= data['text'].apply(extract_features)
#data = pd.concat([data,features], axis = 1)
#data[['text', 'features']]

Unnamed: 0,char_count,word_count,sentence_count,exclamation_count,question_mark_count,hashtag_count,mentions_count,avg_word_length
1,17,4,1,0,0,0,0,3.5
