# Simple Text Analysis
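This notebook walks through a basic text-analysis pipeline on a few sample product reviews: text cleaning, tokenization, stop-word removal, stemming, word-frequency plotting, and a simple keyword-based sentiment label.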

In [None]:
# 1. Import required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Download required resources.
# 'punkt' is the tokenizer model used by older NLTK releases; newer releases
# (3.8.2 and later) make word_tokenize load 'punkt_tab' instead and raise
# LookupError if it is missing, so both are requested here. nltk.download
# skips packages that are already up to date.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# 2. Load sample data
# Four short product reviews, held in a single-column DataFrame so every
# later step can be added as a new column next to the raw text.
text_data = [
    "I love this product! It's amazing and works great.",
    "This product is terrible. Don't buy it.",
    "Pretty good product, but a bit expensive.",
    "The customer service was excellent!!",
]
df = pd.DataFrame({'text': text_data})
df.head()

In [None]:
# 3. Text Cleaning
def clean_text(text):
    """Normalize a raw string ahead of tokenization.

    The input is lowercased, every character that is neither an ASCII
    letter nor whitespace is deleted (digits and punctuation vanish, so
    "it's" becomes "its"), and finally each run of whitespace is collapsed
    to one space with leading/trailing whitespace trimmed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; empty if nothing but removed characters
        was present.
    """
    lowered = text.lower()
    # Deleting (rather than replacing with a space) joins the pieces around
    # an apostrophe or digit into a single word.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()

# Store the normalized version of every review alongside the raw text,
# then show both columns for a quick visual comparison.
df['cleaned_text'] = df['text'].map(clean_text)
df.loc[:, ['text', 'cleaned_text']].head()

In [None]:
# 4. Tokenization
# Turn each cleaned review into a list of word tokens using NLTK's
# word_tokenize; the lists are stored per row in a new column.
df['tokens'] = df['cleaned_text'].map(word_tokenize)

# 5. Remove Stopwords
# The English stop-word list is converted to a set once, up front, so the
# per-token membership checks below are constant-time.
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens that are not English stop words, in original order.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list holding only the tokens absent from ``stop_words``.
    """
    kept = []
    for word in tokens:
        if word not in stop_words:
            kept.append(word)
    return kept

df['tokens_no_stop'] = df['tokens'].map(remove_stopwords)

# 6. Stemming
# One shared Porter stemmer instance is reused for every token.
stemmer = PorterStemmer()

def stem_words(tokens):
    """Return a list with the Porter stem of each token, in order.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A new list of the same length containing ``stemmer.stem(word)``
        for every input word.
    """
    stemmed = []
    for word in tokens:
        stemmed.append(stemmer.stem(word))
    return stemmed

df['tokens_stemmed'] = df['tokens_no_stop'].map(stem_words)
df[['text', 'tokens_stemmed']].head()

In [None]:
# 7. Word Frequency Analysis
def get_word_freq(tokens_list):
    """Count how often each token occurs across a collection of token lists.

    Args:
        tokens_list: Iterable whose items are themselves iterables of
            tokens (for example a DataFrame column of token lists).

    Returns:
        A ``Counter`` mapping each distinct token to its total number of
        occurrences over all the inner iterables.
    """
    counts = Counter()
    for tokens in tokens_list:
        for word in tokens:
            counts[word] += 1
    return counts

# Aggregate token counts over the stemmed tokens of every row.
word_freq = get_word_freq(df['tokens_stemmed'])

# Plot top 10 most common words
plt.figure(figsize=(10, 6))
# orient='index' puts each token in the index, with its count in a single
# 'count' column.
word_freq_df = pd.DataFrame.from_dict(word_freq, orient='index', columns=['count'])
# Keep only the ten rows with the highest counts, largest first.
word_freq_df = word_freq_df.sort_values('count', ascending=False).head(10)
# Tokens (the index) go on the x-axis, their counts on the y-axis.
sns.barplot(x=word_freq_df.index, y='count', data=word_freq_df)
# Rotate the tick labels by 45 degrees.
plt.xticks(rotation=45)
plt.title('Top 10 Most Common Words')
plt.tight_layout()
plt.show()

In [None]:
# 8. Simple Sentiment Analysis
# Keyword lists used by simple_sentiment; tokens are matched against them
# after lowercasing.
positive_words = ['love', 'great', 'excellent', 'good', 'amazing']
negative_words = ['terrible', 'bad', 'poor', 'horrible']

def simple_sentiment(tokens):
    """Label a token sequence by comparing positive and negative keyword hits.

    Each token is lowercased and checked against ``positive_words`` and
    ``negative_words``. Whichever list collects more hits decides the
    label; equal counts (including no hits at all) give 'neutral'.

    Args:
        tokens: Iterable of word strings.

    Returns:
        One of 'positive', 'negative' or 'neutral'.
    """
    # Running balance: +1 per positive hit, -1 per negative hit.
    balance = 0
    for token in tokens:
        lowered = token.lower()
        if lowered in positive_words:
            balance += 1
        if lowered in negative_words:
            balance -= 1

    if balance > 0:
        return 'positive'
    if balance < 0:
        return 'negative'
    return 'neutral'

# Label every row. The raw 'tokens' column is used here rather than the
# stemmed tokens, so the words being matched are the unstemmed forms.
df['sentiment'] = df['tokens'].apply(simple_sentiment)

# Display results
# value_counts() gives the number of rows per sentiment label.
print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())

# Plot sentiment distribution
# The same per-label counts as printed above, drawn as a bar chart.
plt.figure(figsize=(8, 6))
df['sentiment'].value_counts().plot(kind='bar')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.tight_layout()
plt.show()