In [19]:
# Import necessary libraries and modules
import re
import string

import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [20]:
# Load the reviews CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs where it exists.
reviews_csv_path = r'C:\Users\bhara\Downloads\Hotel_sentiment _project\google_reviews3.csv'
df = pd.read_csv(reviews_csv_path)

In [21]:
# Replace missing values in the 'text' column with an empty string.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected column: that form operates on an
# intermediate object and is not guaranteed to update df (pandas warns about
# it and it has no effect under Copy-on-Write).
df['text'] = df['text'].fillna('')

In [22]:
# Binary target: 1 when the rating is 4 or higher, 0 otherwise
is_positive = df['rating'] >= 4
df['label'] = is_positive.astype('int64')

In [23]:
# Extra stop words layered on top of NLTK's English list.
# Every entry is lower-case: the text is lower-cased before stop words are
# filtered, so an upper-case entry (previously "I") could never match a token.
additional_stop_words = [
    "will", "always", "go", "one", "very", "good", "only", "mr", "lot", "two",
    "th", "etc", "don", "due", "didn", "since", "nt", "ms", "ok", "almost",
    "put", "pm", "hyatt", "grand", "till", "add", "let", "hotel", "able",
    "per", "st", "couldn", "yet", "par", "hi", "well", "would", "i", "the",
    "s", "also", "great", "get", "like", "take", "thank"
]

In [24]:
# Merge NLTK's English stop words with the project-specific additions into one set
english_stop_words = set(stopwords.words('english'))
stop_words_keywords = english_stop_words | set(additional_stop_words)

In [25]:
# Model input (review text) and target (binary label)
X, y = df['text'], df['label']

In [26]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=101,
)
X_train, X_test, y_train, y_test = split_parts

In [27]:
# Build the pipeline inside this cell so it runs on a fresh kernel: `pipeline`
# is otherwise only created in a later cell, which makes this cell raise
# NameError when the notebook is executed top to bottom.
# The steps use the same configuration as that later pipeline definition.
pipeline = ImbPipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),  # TF-IDF vectorization
    ('smote', SMOTE(random_state=42)),  # Oversample inside the pipeline, i.e. only on data passed to fit
    ('classifier', LogisticRegression(C=100, max_iter=1000, penalty='l2', solver='saga', fit_intercept=False)),
])

# Train on the training split only, so the test split stays unseen
pipeline.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = pipeline.predict(X_test)

# Evaluate the performance (fraction of test rows predicted correctly)
accuracy = accuracy_score(y_test, y_pred)

# Print the results as a percentage
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 88.80%


In [28]:
# Custom transformer for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that cleans raw text documents.

    Each document has digit runs and ASCII punctuation removed, is
    lower-cased, tokenized with ``word_tokenize``, filtered against
    ``stop_words`` and lemmatized with ``WordNetLemmatizer``. The remaining
    tokens are joined back together with single spaces.

    Parameters
    ----------
    stop_words : collection of str, optional
        Tokens to drop after tokenization. Tokens are lower-case when they
        are compared, so entries should be lower-case too. ``None`` (the
        default) disables stop-word removal.
    """

    def __init__(self, stop_words=None):
        # Store the argument exactly as given. Substituting a default here
        # (the previous ``stop_words or set()``) makes ``sklearn.base.clone``
        # fail, because the constructor would hand back a different object
        # than the one it was passed whenever the value is empty.
        self.stop_words = stop_words

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X):
        """Clean every document in ``X``.

        Parameters
        ----------
        X : pandas Series or iterable of str
            Documents to clean.

        Returns
        -------
        The result of ``X.apply(self.clean_text)`` when ``X`` provides
        ``apply`` (e.g. a pandas Series); otherwise a list of cleaned strings,
        one per element of ``X``.
        """
        if hasattr(X, "apply"):
            return X.apply(self.clean_text)
        # Plain iterables (lists, tuples, arrays) have no ``apply`` method.
        return [self.clean_text(document) for document in X]

    def clean_text(self, text):
        """Return the cleaned form of a single document.

        Non-string input (for example a missing value) yields an empty string.
        """
        if not isinstance(text, str):
            return ''

        text = re.sub(r'\d+', '', text)  # Remove digits
        text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
        text = text.lower()  # Convert to lowercase

        # Resolve the "no stop words" default here rather than in __init__.
        stop_words = self.stop_words or ()

        lemmatizer = WordNetLemmatizer()
        words = [
            lemmatizer.lemmatize(word)
            for word in word_tokenize(text)  # Tokenize the text
            if word not in stop_words  # Stop words are removed before lemmatization
        ]

        return ' '.join(words)  # Join cleaned words back into a string


In [29]:
# Configure the three stages individually, then chain them in an imblearn pipeline
tfidf_step = TfidfVectorizer(stop_words='english')
smote_step = SMOTE(random_state=42)
classifier_step = LogisticRegression(
    C=100,
    max_iter=1000,
    penalty='l2',
    solver='saga',
    fit_intercept=False,
)
pipeline = ImbPipeline([
    ('tfidf', tfidf_step),
    ('smote', smote_step),
    ('classifier', classifier_step),
])

# Fit on every row of df (text -> label); no rows are held out here
pipeline.fit(df['text'], df['label'])

In [30]:
# Try the fitted pipeline on two hand-written phrases
new_phrases = ['I love this product!', 'This is terrible. I hate it.']
predicted_sentiments = pipeline.predict(pd.Series(new_phrases))

# Show each phrase next to the label predicted for it
for position, phrase in enumerate(new_phrases):
    sentiment = predicted_sentiments[position]
    print(f'Phrase: "{phrase}" - Predicted Sentiment: {sentiment}')

Phrase: "I love this product!" - Predicted Sentiment: 1
Phrase: "This is terrible. I hate it." - Predicted Sentiment: 0


In [37]:
# Run model.py as a separate Python process through the shell.
# NOTE(review): model.py is not part of this notebook; presumably it trains the
# pipeline and writes the model.joblib file loaded below — confirm.
!python model.py

found 0 physical cores < 1
  File "C:\Users\bhara\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 245, in _count_physical_cores
    raise ValueError(


In [32]:
# Load the persisted pipeline from disk. Passing the path lets joblib open and
# close the file itself; the previous open(...) handle was never closed.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a trusted source.
newpipe = joblib.load('model.joblib')

In [33]:
# Inspect the class of the object returned by joblib.load
type(newpipe)

imblearn.pipeline.Pipeline

In [34]:
# Print the predicted label for a few sample phrases, one per line
for sample_text in ('awesome place', 'terrible!', 'very interesting place'):
    print(newpipe.predict(pd.Series(sample_text))[0])

1
0
1


In [35]:
# Show the version of joblib installed in this environment
joblib.__version__

'1.1.1'

- requirements.txt
- app.py
- Procfile
- model.joblib
- utils.py
- templates/ (folder containing index.html)
- static/ (folder containing css/style.css)

In [None]:
# Run app.py as a separate Python process through the shell.
# NOTE(review): app.py is not part of this notebook; presumably it is the web
# app from the file list above (templates/, static/) — confirm.
!python app.py



---



---



© 2023 Institute of Data


---



---



