<a href="https://colab.research.google.com/github/loki20051267/NLP/blob/main/aug14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://files.consumerfinance.gov/ccdb/complaints.csv.zip -O complaints.csv.zip
!unzip complaints.csv.zip
import pandas as pd
df = pd.read_csv('complaints.csv', usecols=['complaint_what_happened'])
df = df.rename(columns={'complaint_what_happened': 'complaint_text'})


In [None]:
# Working sample: the first 25 rows that actually contain a narrative.
sample = (
    df['complaint_text']
    .dropna()
    .head(25)
)

# Show each narrative with its 1-based position, character count and
# whitespace-delimited word count.
for idx, text in enumerate(sample, start=1):
    num_chars, num_words = len(text), len(text.split())
    print(f"{idx:2d}. ({num_chars} chars, {num_words} words): {text}")


In [None]:
import spacy
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter

# word_tokenize and pos_tag load NLTK data packages at call time and raise
# LookupError when they are missing, which is the case on a fresh kernel.
# The package ids differ between NLTK releases, so both spellings are requested.
# NOTE(review): nltk.download is expected to return False (not raise) for an id
# the installed release does not know -- confirm for the pinned NLTK version.
for resource in (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(resource, quiet=True)

# spaCy pipeline; loaded here and reused by the verb/adjective cell below.
nlp = spacy.load("en_core_web_sm")

# Tally NLTK part-of-speech tags across every narrative in the sample.
all_tags = Counter()
for text in sample:
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    all_tags.update(tag for word, tag in tags)

# Tags with their counts, most frequent first.
print(all_tags.most_common())


In [None]:
import matplotlib.pyplot as plt

verbs = Counter()
adjectives = Counter()

# Map each coarse POS tag of interest to the counter that collects it;
# tokens with any other tag are ignored.
counters_by_pos = {"VERB": verbs, "ADJ": adjectives}

# The "ner" and "parser" components are disabled for this pass.
for doc in nlp.pipe(sample, disable=["ner", "parser"]):
    for token in doc:
        bucket = counters_by_pos.get(token.pos_)
        # Compare against None explicitly: an empty Counter is falsy.
        if bucket is not None:
            bucket[token.lemma_.lower()] += 1

# Plotting
def plot_counter(counter, title, top_n=10):
    """Draw a bar chart of the most frequent entries of a Counter.

    Args:
        counter: a collections.Counter (anything exposing ``most_common``).
        title: text placed above the chart.
        top_n: how many of the most frequent entries to show (default 10).

    Returns:
        None. The figure is displayed with ``plt.show()``. When the counter has
        no entries, a short notice is printed and nothing is drawn.
    """
    common = counter.most_common(top_n)
    if not common:
        # zip(*[]) produces nothing, so the two-name unpacking below would
        # raise ValueError on an empty counter.
        print(f"Nothing to plot for '{title}': counter is empty.")
        return
    # Split [(word, count), ...] into parallel tuples for the bar chart.
    words, counts = zip(*common)
    plt.figure(figsize=(8,5))
    plt.bar(words, counts)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.show()

# One chart per counter, drawn in the same order: verbs first, then adjectives.
for counter, title in (
    (verbs, "Top 10 Verbs"),
    (adjectives, "Top 10 Adjectives"),
):
    plot_counter(counter, title)


In [None]:
import re

# Small hand-written inputs that exercise phone, e-mail, URL, punctuation and
# hashtag handling.
samples = [
    "My phone number is 1234567890 and my email is test@domain.com",
    "Visit https://example.com for more info!!!",
    "HELLO!!! This is SOOOOO exciting :))",
    "Contact us at info@company.org or call +91 98765-43210",
    "Python's regex is very useful!!!  #Coding #Fun"
]

# Optional '+', a digit, at least 7 digits/hyphens/whitespace, and a final digit.
phone_pattern = re.compile(r'(\+?\d[\d\-\s]{7,}\d)')
# Any run of non-whitespace characters around an '@'.
email_pattern = re.compile(r'\S+@\S+')
# http:// or https:// followed by non-whitespace characters.
url_pattern = re.compile(r'https?://\S+')
# Compiled once here instead of being re-specified on every loop iteration.
non_alnum_pattern = re.compile(r'[^A-Za-z0-9\s]+')
whitespace_pattern = re.compile(r'\s+')


def extract_phones_and_clean(text):
    """Return ``(phones, cleaned)`` for one piece of text.

    URLs and e-mail addresses are removed first, so digit runs embedded in them
    are not reported as phone numbers. Phone numbers are then collected and
    removed, every character that is not an ASCII letter, digit or whitespace is
    dropped, and whitespace runs are collapsed to single spaces.

    Args:
        text: the raw input string.

    Returns:
        A tuple of the list of phone-number strings found and the cleaned text.
    """
    stripped = url_pattern.sub('', text)
    stripped = email_pattern.sub('', stripped)
    phones = phone_pattern.findall(stripped)
    clean = phone_pattern.sub('', stripped)
    clean = non_alnum_pattern.sub('', clean)
    clean = whitespace_pattern.sub(' ', clean).strip()
    return phones, clean


for text in samples:
    phones, clean = extract_phones_and_clean(text)
    print("Phones found:", phones)
    print("Cleaned:", clean, "\n")
